Source code for sssom.parsers

"""SSSOM parsers."""

import io
import itertools as itt
import json
import logging as _logging
import re
import typing
from collections import ChainMap, Counter
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, TextIO, Tuple, Union, cast
from xml.dom import Node, minidom
from xml.dom.minidom import Document

import curies
import numpy as np
import pandas as pd
import requests
import yaml
from curies import Converter
from linkml_runtime.loaders.json_loader import JSONLoader
from linkml_runtime.loaders.rdflib_loader import RDFLibLoader
from pandas.errors import EmptyDataError
from rdflib import Graph
from sssom_schema import Mapping, MappingSet

from sssom.constants import (
    CONFIDENCE,
    CURIE_MAP,
    DEFAULT_MAPPING_PROPERTIES,
    LICENSE,
    MAPPING_JUSTIFICATION,
    MAPPING_JUSTIFICATION_UNSPECIFIED,
    MAPPING_SET_ID,
    OBJECT_ID,
    OBJECT_LABEL,
    OBJECT_SOURCE,
    OBJECT_SOURCE_ID,
    OBO_HAS_DB_XREF_URI,
    OWL_EQUIV_CLASS_URI,
    PREDICATE_ID,
    RDF_TYPE,
    RDF_TYPE_URI,
    RDFS_SUBCLASS_OF,
    SKOS_BROAD_MATCH,
    SKOS_BROAD_MATCH_URI,
    SKOS_EXACT_MATCH,
    SKOS_EXACT_MATCH_URI,
    SKOS_NARROW_MATCH,
    SKOS_NARROW_MATCH_URI,
    SUBJECT_ID,
    SUBJECT_LABEL,
    SUBJECT_SOURCE,
    SUBJECT_SOURCE_ID,
    MetadataType,
    _get_sssom_schema_object,
    get_default_metadata,
)

from .context import ConverterHint, _get_built_in_prefix_map, ensure_converter
from .sssom_document import MappingSetDocument
from .util import (
    SSSOM_DEFAULT_RDF_SERIALISATION,
    MappingSetDataFrame,
    get_file_extension,
    is_multivalued_slot,
    raise_for_bad_path,
    safe_compress,
    to_mapping_set_dataframe,
)

logging = _logging.getLogger(__name__)

# * *******************************************************
# Parsers (from file)


def _open_input(input: Union[str, Path, TextIO]) -> io.StringIO:
    """Transform a URL, a filepath (from pathlib), or a string (with file contents) to a StringIO object.

    :param input: A string representing a URL, a filepath, or file contents,
        or a Path object representing a filepath.
    :return: A StringIO object containing the input data.
    """
    # If the input already is a StringIO, return it
    if isinstance(input, io.StringIO):
        return input
    elif isinstance(input, Path):
        input = str(input)

    if isinstance(input, str):
        if input.startswith("http://") or input.startswith("https://"):
            # It's a URL
            data = requests.get(input, timeout=30).content
            return io.StringIO(data.decode("utf-8"))
        elif "\n" in input or "\r" in input:
            # It's string data
            return io.StringIO(input)
        else:
            # It's a local file path
            with open(input, "r") as file:
                file_content = file.read()
            return io.StringIO(file_content)

    raise IOError(f"Could not determine the type of input {input}")


def _separate_metadata_and_table_from_stream(s: io.StringIO):
    s.seek(0)

    # Create a new StringIO object for filtered data
    table_component = io.StringIO()
    metadata_component = io.StringIO()

    header_section = True

    # Filter out lines starting with '#'
    for line in s:
        if not line.startswith("#"):
            table_component.write(line)
            if header_section:
                header_section = False
        elif header_section:
            # We strip any trailing tabs. Such tabs may have been left
            # by a spreadsheet editor who treated the header lines as
            # if they were normal data lines; they would prevent the
            # YAML parser from correctly parsing the metadata block.
            metadata_component.write(line.rstrip("\t\n") + "\n")
        else:
            logging.info(
                f"Line {line} starts with a hash symbol, but the header section has "
                f"already been passed. This line is skipped."
            )

    # Reset the cursor to the start of the new StringIO object
    table_component.seek(0)
    metadata_component.seek(0)
    return table_component, metadata_component
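

# For illustration, a hypothetical stream such as the following is split so
# that the '#'-prefixed YAML header ends up in the metadata component and the
# remaining (tab-separated) rows in the table component:
#
#     #curie_map:
#     #  HP: "http://purl.obolibrary.org/obo/HP_"
#     #mapping_set_id: "https://example.org/test.sssom.tsv"
#     subject_id	predicate_id	object_id	mapping_justification
#     HP:0000001	skos:exactMatch	MP:0000001	semapv:ManualMappingCuration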


def _read_pandas_and_metadata(input: io.StringIO, sep: Optional[str] = None):
    """Read a tabular data file by wrapping :func:`pd.read_csv` to handle comment lines correctly.

    :param input: The file to read. If no separator is given, this file should be named.
    :param sep: File separator for pandas
    :return: A pandas dataframe and the metadata parsed from the comment block
    """
    table_stream, metadata_stream = _separate_metadata_and_table_from_stream(input)

    try:
        df = pd.read_csv(table_stream, sep=sep, dtype=str, engine="python")
        df.fillna("", inplace=True)
    except EmptyDataError as e:
        logging.warning(f"Seems like the dataframe is empty: {e}")
        df = pd.DataFrame(
            columns=[
                SUBJECT_ID,
                SUBJECT_LABEL,
                PREDICATE_ID,
                OBJECT_ID,
                MAPPING_JUSTIFICATION,
            ]
        )

    if isinstance(df, pd.DataFrame):
        sssom_metadata = _read_metadata_from_table(metadata_stream)
        return df, sssom_metadata

    return None, None


def _get_seperator_symbol_from_file_path(file):
    r"""
    Take as input a filepath and return the separator symbol used, for example, by pandas.

    :param file: the file path
    :return: the separator symbol as a string, e.g. '\t'
    """
    if isinstance(file, Path) or isinstance(file, str):
        extension = get_file_extension(file)
        if extension == "tsv":
            return "\t"
        elif extension == "csv":
            return ","
        logging.warning(f"Could not guess file extension for {file}")
    return None


def _is_check_valid_extension_slot(slot_name, meta):
    extension_definitions = meta.get("extension_definitions", [])
    return any(
        "property" in entry and entry.get("slot_name") == slot_name
        for entry in extension_definitions
    )


def _is_irregular_metadata(metadata_list: List[Dict]):
    fail_metadata = False
    for m in metadata_list:
        for key in m:
            if key not in _get_sssom_schema_object().mapping_set_slots:
                if not _is_check_valid_extension_slot(key, m):
                    logging.warning(
                        f"Metadata key '{key}' is not a standard SSSOM mapping set metadata field. See "
                        f"https://mapping-commons.github.io/sssom/spec-model/#non-standard-slots on how to "
                        f"specify additional, non-standard fields in a SSSOM file."
                    )
                    fail_metadata = True
    return fail_metadata
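

# For example, a metadata block like the following (values are illustrative)
# would not be flagged as irregular, because the non-standard key is declared
# under extension_definitions together with a property:
#
#     meta = {
#         "mapping_set_id": "https://example.org/test.sssom.tsv",
#         "ext_reviewer": "orcid:0000-0000-0000-0000",
#         "extension_definitions": [
#             {"slot_name": "ext_reviewer", "property": "dcterms:contributor"},
#         ],
#     }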


def _check_redefined_builtin_prefixes(sssom_metadata, meta, prefix_map):
    # There are three ways in which prefixes can be communicated, so we will check all of them.
    # This is a bit overly draconian, as in the end, only the highest-priority one gets picked,
    # but since this only constitutes a (logging) warning, I think it's worth reporting.
    builtin_converter = _get_built_in_prefix_map()
    sssom_metadata_converter = _get_converter_pop_replace_curie_map(sssom_metadata)
    meta_converter = _get_converter_pop_replace_curie_map(meta)
    prefix_map_converter = ensure_converter(prefix_map, use_defaults=False)
    is_valid_prefixes = True

    for converter in [sssom_metadata_converter, meta_converter, prefix_map_converter]:
        for builtin_prefix, builtin_uri in builtin_converter.bimap.items():
            if builtin_prefix in converter.bimap:
                if builtin_uri != converter.bimap[builtin_prefix]:
                    logging.warning(
                        f"A built-in prefix ({builtin_prefix}) was provided, "
                        f"but the provided URI expansion ({converter.bimap[builtin_prefix]}) does not correspond "
                        f"to the required URI expansion: {builtin_uri}. The prefix will be ignored."
                    )
                    is_valid_prefixes = False
            # NOTE during refactor replace the following line by https://github.com/biopragmatics/curies/pull/136
            reverse_bimap = {value: key for key, value in builtin_converter.bimap.items()}
            if builtin_uri in reverse_bimap:
                if builtin_prefix != reverse_bimap[builtin_uri]:
                    logging.warning(
                        f"A built-in URI namespace ({builtin_uri}) was used in (one of) the provided prefix map(s), "
                        f"but the provided prefix ({reverse_bimap[builtin_uri]}) does not correspond to the "
                        f"standard prefix: {builtin_prefix}. The prefix will be ignored."
                    )
                    is_valid_prefixes = False
    return is_valid_prefixes
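

# A sketch of a redefinition that this check would flag (assuming "owl" is
# among the built-in prefixes, and that a plain dict is an acceptable
# ``prefix_map`` hint):
#
#     _check_redefined_builtin_prefixes(
#         {},                                       # no document metadata
#         {},                                       # no ``meta`` argument
#         {"owl": "https://example.org/not-owl#"},  # clashes with the built-in expansion
#     )  # returns False and logs a warning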


def _fail_in_strict_parsing_mode(is_valid_built_in_prefixes, is_valid_metadata):
    report = ""
    if not is_valid_built_in_prefixes:
        report += "STRONG WARNING: The prefix map provided contains built-in prefixes that were redefined.+\n"
    if not is_valid_metadata:
        report += (
            "STRONG WARNING: The metadata provided contains non-standard and undefined metadata.+\n"
        )

    if report:
        raise ValueError(report)


def _get_converter_pop_replace_curie_map(sssom_metadata):
    """
    Pop CURIE_MAP from sssom_metadata, process it, and restore it if it existed.

    Args:
        sssom_metadata (dict): The metadata dictionary.

    Returns:
        Converter: A Converter object created from the CURIE_MAP.
    """
    curie_map = sssom_metadata.pop(CURIE_MAP, {})

    # Process the popped value
    sssom_metadata_converter = Converter.from_prefix_map(curie_map)

    # Reinsert CURIE_MAP if it was present
    if curie_map:
        sssom_metadata[CURIE_MAP] = curie_map

    return sssom_metadata_converter


def parse_sssom_table(
    file_path: Union[str, Path, TextIO],
    prefix_map: ConverterHint = None,
    meta: Optional[MetadataType] = None,
    **kwargs,
) -> MappingSetDataFrame:
    """Parse a TSV to a :class:`MappingSetDocument` to a :class:`MappingSetDataFrame`."""
    if isinstance(file_path, Path) or isinstance(file_path, str):
        raise_for_bad_path(file_path)
    stream: io.StringIO = _open_input(file_path)
    sep_new = _get_seperator_symbol_from_file_path(file_path)
    df, sssom_metadata = _read_pandas_and_metadata(stream, sep_new)
    if meta is None:
        meta = {}

    is_valid_built_in_prefixes = _check_redefined_builtin_prefixes(sssom_metadata, meta, prefix_map)
    is_valid_metadata = _is_irregular_metadata([sssom_metadata, meta])

    if kwargs.get("strict"):
        _fail_in_strict_parsing_mode(is_valid_built_in_prefixes, is_valid_metadata)

    # The priority order for combining prefix maps is:
    # 1. Built-in prefix map
    # 2. Internal prefix map inside the document
    # 3. Prefix map passed through this function inside the ``meta``
    # 4. Prefix map passed through this function to ``prefix_map`` (handled with ensure_converter)
    converter = curies.chain(
        [
            _get_built_in_prefix_map(),
            Converter.from_prefix_map(sssom_metadata.pop(CURIE_MAP, {})),
            Converter.from_prefix_map(meta.pop(CURIE_MAP, {})),
            ensure_converter(prefix_map, use_defaults=False),
        ]
    )

    # The priority order for combining metadata is:
    # 1. Metadata appearing in the SSSOM document
    # 2. Metadata passed through ``meta`` to this function
    # 3. Default metadata
    combine_meta = dict(
        ChainMap(
            sssom_metadata,
            meta,
            get_default_metadata(),
        )
    )

    msdf = from_sssom_dataframe(df, prefix_map=converter, meta=combine_meta)
    return msdf
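

# A minimal usage sketch (the file name is hypothetical); ``strict=True``
# escalates the prefix and metadata warnings above into a ValueError:
#
#     msdf = parse_sssom_table("mappings.sssom.tsv", strict=True)
#     print(msdf.df.head())
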
def parse_sssom_rdf(
    file_path: str,
    prefix_map: ConverterHint = None,
    meta: Optional[MetadataType] = None,
    serialisation=SSSOM_DEFAULT_RDF_SERIALISATION,
    **kwargs,
    # mapping_predicates: Optional[List[str]] = None,
) -> MappingSetDataFrame:
    """Parse an SSSOM RDF file to a :class:`MappingSetDocument` to a :class:`MappingSetDataFrame`."""
    raise_for_bad_path(file_path)

    g = Graph()
    g.parse(file_path, format=serialisation)

    # Initialize meta if it's None
    if meta is None:
        meta = {}

    # The priority order for combining prefix maps is:
    # 1. Built-in prefix map
    # 2. Internal prefix map inside the document
    # 3. Prefix map passed through this function inside the ``meta``
    # 4. Prefix map passed through this function to ``prefix_map`` (handled with ensure_converter)
    converter = curies.chain(
        [
            _get_built_in_prefix_map(),
            Converter.from_rdflib(g),
            Converter.from_prefix_map(meta.pop(CURIE_MAP, {})),
            ensure_converter(prefix_map, use_defaults=False),
        ]
    )
    msdf = from_sssom_rdf(g, prefix_map=converter, meta=meta)

    # df: pd.DataFrame = msdf.df
    # if mapping_predicates and not df.empty():
    #     msdf.df = df[df["predicate_id"].isin(mapping_predicates)]

    return msdf

def parse_sssom_json(
    file_path: str,
    prefix_map: ConverterHint = None,
    meta: Optional[MetadataType] = None,
    **kwargs,
) -> MappingSetDataFrame:
    """Parse an SSSOM JSON file to a :class:`MappingSetDocument` to a :class:`MappingSetDataFrame`."""
    raise_for_bad_path(file_path)

    with open(file_path) as json_file:
        jsondoc = json.load(json_file)

    # Initialize meta if it's None
    if meta is None:
        meta = {}

    # The priority order for combining prefix maps is:
    # 1. Built-in prefix map
    # 2. Internal prefix map inside the document
    # 3. Prefix map passed through this function inside the ``meta``
    # 4. Prefix map passed through this function to ``prefix_map`` (handled with ensure_converter)
    converter = curies.chain(
        [
            _get_built_in_prefix_map(),
            Converter.from_jsonld(file_path),
            Converter.from_prefix_map(meta.pop(CURIE_MAP, {})),
            ensure_converter(prefix_map, use_defaults=False),
        ]
    )
    msdf = from_sssom_json(jsondoc=jsondoc, prefix_map=converter, meta=meta)
    return msdf

# Import methods from external file formats
def parse_obographs_json(
    file_path: str,
    prefix_map: ConverterHint = None,
    meta: Optional[MetadataType] = None,
    mapping_predicates: Optional[List[str]] = None,
) -> MappingSetDataFrame:
    """Parse an obographs file as a JSON object and translate it into a MappingSetDataFrame.

    :param file_path: The path to the obographs file
    :param prefix_map: an optional prefix map
    :param meta: an optional dictionary of metadata elements
    :param mapping_predicates: an optional list of mapping predicates that should be extracted
    :return: An SSSOM MappingSetDataFrame
    """
    raise_for_bad_path(file_path)
    converter, meta = _get_prefix_map_and_metadata(prefix_map=prefix_map, meta=meta)

    with open(file_path) as json_file:
        jsondoc = json.load(json_file)

    return from_obographs(
        jsondoc,
        prefix_map=converter,
        meta=meta,
        mapping_predicates=mapping_predicates,
    )

def _get_prefix_map_and_metadata(
    prefix_map: ConverterHint = None, meta: Optional[MetadataType] = None
) -> Tuple[Converter, MetadataType]:
    if meta is None:
        meta = get_default_metadata()
    converter = curies.chain(
        [
            _get_built_in_prefix_map(),
            Converter.from_prefix_map(meta.pop(CURIE_MAP, {})),
            ensure_converter(prefix_map, use_defaults=False),
        ]
    )
    return converter, meta


def _address_multivalued_slot(k: str, v: Any) -> Union[str, List[str]]:
    if isinstance(v, str) and is_multivalued_slot(k):
        # If k is multivalued, then v = List[values]
        return [s.strip() for s in v.split("|")]
    else:
        return v


def _init_mapping_set(meta: Optional[MetadataType]) -> MappingSet:
    _metadata = dict(ChainMap(meta or {}, get_default_metadata()))
    mapping_set = MappingSet(mapping_set_id=_metadata[MAPPING_SET_ID], license=_metadata[LICENSE])
    _set_metadata_in_mapping_set(mapping_set=mapping_set, metadata=meta)
    return mapping_set


def _get_mapping_dict(
    row: pd.Series, bad_attrs: Counter, mapping_slots: typing.Set[str]
) -> Dict[str, Any]:
    """Generate a mapping dictionary from a given row of data.

    It also updates the 'bad_attrs' counter for keys that are not present
    in the sssom_schema_object's mapping_slots.
    """
    # Populate the mapping dictionary with key-value pairs from the row,
    # only if the value exists, is not NaN, and the key is in the schema's mapping slots.
    # The value could be a string or a list and is handled accordingly via _address_multivalued_slot().
    mdict = {
        k: _address_multivalued_slot(k, v)
        for k, v in row.items()
        if v and pd.notna(v) and k in mapping_slots
    }
    # Update bad_attrs for keys not in mapping_slots
    bad_keys = set(row.keys()) - mapping_slots
    for bad_key in bad_keys:
        bad_attrs[bad_key] += 1
    return mdict

def parse_alignment_xml(
    file_path: str,
    prefix_map: ConverterHint = None,
    meta: Optional[MetadataType] = None,
    mapping_predicates: Optional[List[str]] = None,
) -> MappingSetDataFrame:
    """Parse an Alignment API XML file to a :class:`MappingSetDocument` to a :class:`MappingSetDataFrame`."""
    raise_for_bad_path(file_path)
    converter, meta = _get_prefix_map_and_metadata(prefix_map=prefix_map, meta=meta)
    logging.info("Loading from alignment API")
    xmldoc = minidom.parse(file_path)
    msdf = from_alignment_minidom(
        xmldoc,
        prefix_map=converter,
        meta=meta,
        mapping_predicates=mapping_predicates,
    )
    return msdf

# Readers (from object)
def from_sssom_dataframe(
    df: pd.DataFrame,
    prefix_map: ConverterHint = None,
    meta: Optional[MetadataType] = None,
) -> MappingSetDataFrame:
    """Convert a dataframe to a MappingSetDataFrame.

    :param df: A mappings dataframe
    :param prefix_map: A prefix map
    :param meta: A metadata dictionary
    :return: MappingSetDataFrame
    """
    converter = ensure_converter(prefix_map)

    # Need to revisit this solution.
    # This is to address: A value is trying to be set on a copy of a slice from a DataFrame
    if CONFIDENCE in df.columns:
        df.replace({CONFIDENCE: r"^\s*$"}, np.nan, regex=True, inplace=True)

    mapping_set = _get_mapping_set_from_df(df=df, meta=meta)
    doc = MappingSetDocument(mapping_set=mapping_set, converter=converter)
    return to_mapping_set_dataframe(doc)

def from_sssom_rdf(
    g: Graph,
    prefix_map: ConverterHint = None,
    meta: Optional[MetadataType] = None,
) -> MappingSetDataFrame:
    """Convert an SSSOM RDF graph into an SSSOM data table.

    :param g: the Graph (rdflib)
    :param prefix_map: A dictionary containing the prefix map, defaults to None
    :param meta: Potentially additional metadata, defaults to None
    :return: MappingSetDataFrame object
    """
    converter = ensure_converter(prefix_map)
    mapping_set = cast(
        MappingSet,
        RDFLibLoader().load(
            source=g,
            target_class=MappingSet,
            schemaview=_get_sssom_schema_object().view,
            prefix_map=converter.bimap,
            ignore_unmapped_predicates=True,
        ),
    )

    # The priority order for combining metadata is:
    # 1. Metadata appearing in the SSSOM document
    # 2. Metadata passed through ``meta`` to this function
    # 3. Default metadata
    # As the metadata appearing in the SSSOM document is already parsed by LinkML,
    # we only need to overwrite the metadata from 2 and 3 if it is not present
    combine_meta = dict(
        ChainMap(
            meta or {},
            get_default_metadata(),
        )
    )

    _set_metadata_in_mapping_set(mapping_set, metadata=combine_meta, overwrite=False)
    mdoc = MappingSetDocument(mapping_set=mapping_set, converter=converter)
    return to_mapping_set_dataframe(mdoc)

def from_sssom_json(
    jsondoc: Union[str, dict, TextIO],
    prefix_map: ConverterHint = None,
    meta: Optional[MetadataType] = None,
) -> MappingSetDataFrame:
    """Load a mapping set dataframe from a JSON object.

    :param jsondoc: JSON document
    :param prefix_map: Prefix map
    :param meta: metadata used to augment the metadata existing in the mapping set
    :return: MappingSetDataFrame object
    """
    converter = ensure_converter(prefix_map)
    mapping_set = cast(MappingSet, JSONLoader().load(source=jsondoc, target_class=MappingSet))

    # The priority order for combining metadata is:
    # 1. Metadata appearing in the SSSOM document
    # 2. Metadata passed through ``meta`` to this function
    # 3. Default metadata
    # As the metadata appearing in the SSSOM document is already parsed by LinkML,
    # we only need to overwrite the metadata from 2 and 3 if it is not present
    combine_meta = dict(
        ChainMap(
            meta or {},
            get_default_metadata(),
        )
    )

    _set_metadata_in_mapping_set(mapping_set, metadata=combine_meta, overwrite=False)
    mapping_set_document = MappingSetDocument(mapping_set=mapping_set, converter=converter)
    return to_mapping_set_dataframe(mapping_set_document)

def from_alignment_minidom(
    dom: Document,
    prefix_map: ConverterHint = None,
    meta: Optional[MetadataType] = None,
    mapping_predicates: Optional[List[str]] = None,
) -> MappingSetDataFrame:
    """Read a minidom Document object.

    :param dom: XML (minidom) object
    :param prefix_map: A prefix map
    :param meta: Optional meta data
    :param mapping_predicates: Optional list of mapping predicates to extract
    :return: MappingSetDocument
    :raises ValueError: for alignment format: xml element said, but not set to yes. Only XML is supported!
    """
    converter = ensure_converter(prefix_map)
    ms = _init_mapping_set(meta)
    mlist: List[Mapping] = []
    # bad_attrs = {}

    if not mapping_predicates:
        mapping_predicates = DEFAULT_MAPPING_PROPERTIES

    alignments = dom.getElementsByTagName("Alignment")
    for n in alignments:
        for e in n.childNodes:
            if e.nodeType == Node.ELEMENT_NODE:
                node_name = e.nodeName
                if node_name == "map":
                    cell = e.getElementsByTagName("Cell")
                    for c_node in cell:
                        mdict: Dict[str, Any] = _cell_element_values(
                            c_node, converter, mapping_predicates=mapping_predicates
                        )
                        _add_valid_mapping_to_list(mdict, mlist, flip_superclass_assertions=True)
                elif node_name == "xml":
                    if e.firstChild.nodeValue != "yes":
                        raise ValueError(
                            "Alignment format: xml element said, but not set to yes. Only XML is supported!"
                        )
                elif node_name == "onto1":
                    ms[SUBJECT_SOURCE_ID] = e.firstChild.nodeValue
                elif node_name == "onto2":
                    ms[OBJECT_SOURCE_ID] = e.firstChild.nodeValue
                elif node_name == "uri1":
                    ms[SUBJECT_SOURCE] = e.firstChild.nodeValue
                elif node_name == "uri2":
                    ms[OBJECT_SOURCE] = e.firstChild.nodeValue

    ms.mappings = mlist  # type: ignore
    mapping_set_document = MappingSetDocument(mapping_set=ms, converter=converter)
    return to_mapping_set_dataframe(mapping_set_document)

def _get_obographs_predicate_id(obographs_predicate: str):
    if obographs_predicate == "is_a":
        return RDFS_SUBCLASS_OF
    return obographs_predicate

def from_obographs(
    jsondoc: Dict,
    prefix_map: ConverterHint = None,
    meta: Optional[MetadataType] = None,
    mapping_predicates: Optional[List[str]] = None,
) -> MappingSetDataFrame:
    """Convert an obographs JSON object to an SSSOM data frame.

    :param jsondoc: The JSON object representing the ontology in obographs format
    :param prefix_map: The prefix map to be used
    :param meta: Any additional metadata that needs to be added to the resulting SSSOM data frame, defaults to None
    :param mapping_predicates: Optional list of mapping predicates to extract
    :raises Exception: When there is no CURIE
    :return: An SSSOM data frame (MappingSetDataFrame)
    """
    converter = ensure_converter(prefix_map)
    ms = _init_mapping_set(meta)
    mlist: List[Mapping] = []

    if not mapping_predicates:
        mapping_predicates = DEFAULT_MAPPING_PROPERTIES

    graphs = jsondoc.get("graphs")
    if not graphs:
        raise Exception("No graphs element in obographs file, wrong format?")

    #: A dictionary of node URIs to node labels
    labels: typing.Mapping[str, str] = {
        node["id"]: node.get("lbl")
        for graph in graphs
        for node in graph.get("nodes", [])
        if node.get("lbl")
    }

    for graph in graphs:
        for node in graph.get("nodes", []):
            meta = node.get("meta")
            if not meta:
                continue
            node_uri = node["id"]
            if OBO_HAS_DB_XREF_URI in mapping_predicates:
                for xref in meta.get("xrefs", []):
                    mdict = _make_mdict(
                        node_uri, OBO_HAS_DB_XREF_URI, xref["val"], converter, labels
                    )
                    _add_valid_mapping_to_list(mdict, mlist)

            for value in meta.get("basicPropertyValues", []):
                predicate_uri = value["pred"]
                if predicate_uri not in mapping_predicates:
                    continue
                mdict = _make_mdict(node_uri, predicate_uri, value["val"], converter, labels)
                _add_valid_mapping_to_list(mdict, mlist)

        for edge in graph.get("edges", []):
            predicate_uri = _get_obographs_predicate_id(edge["pred"])
            if predicate_uri not in mapping_predicates:
                continue
            mdict = _make_mdict(edge["sub"], predicate_uri, edge["obj"], converter, labels)
            _add_valid_mapping_to_list(mdict, mlist)

        if OWL_EQUIV_CLASS_URI in mapping_predicates:
            for equivalents in graph.get("equivalentNodesSets", []):
                node_uris = equivalents.get("nodeIds")
                if not node_uris:
                    continue
                for subject_uri, object_uri in itt.product(node_uris, repeat=2):
                    if subject_uri == object_uri:
                        continue
                    mdict = _make_mdict(
                        subject_uri, OWL_EQUIV_CLASS_URI, object_uri, converter, labels
                    )
                    _add_valid_mapping_to_list(mdict, mlist)

    ms.mappings = mlist  # type: ignore
    mdoc = MappingSetDocument(mapping_set=ms, converter=converter)
    return to_mapping_set_dataframe(mdoc)

def _make_mdict(
    subject_id: str,
    predicate_id: str,
    object_id: str,
    converter: Converter,
    labels: typing.Mapping[str, str],
):
    mdict = {
        MAPPING_JUSTIFICATION: MAPPING_JUSTIFICATION_UNSPECIFIED,
    }
    try:
        subject_curie = safe_compress(subject_id, converter)
    except ValueError as e:
        logging.debug("could not parse subject %s - %s", subject_id, e)
    else:
        mdict[SUBJECT_ID] = subject_curie

    try:
        predicate_curie = safe_compress(predicate_id, converter)
    except ValueError as e:
        logging.debug("could not parse predicate %s - %s", predicate_id, e)
    else:
        mdict[PREDICATE_ID] = predicate_curie

    try:
        object_curie = safe_compress(object_id, converter)
    except ValueError as e:
        logging.debug("could not parse object %s - %s", object_id, e)
    else:
        mdict[OBJECT_ID] = object_curie

    if subject_id in labels:
        mdict[SUBJECT_LABEL] = labels[subject_id]
    if object_id in labels:
        mdict[OBJECT_LABEL] = labels[object_id]
    return mdict


# All from_* functions take as input a Python object (data frame, JSON, etc.) and return a MappingSetDataFrame.
# All read_* functions take as input a file handle and return a MappingSetDataFrame (usually wrapping a from_* method).
PARSING_FUNCTIONS: typing.Mapping[str, Callable] = {
    "tsv": parse_sssom_table,
    "obographs-json": parse_obographs_json,
    "alignment-api-xml": parse_alignment_xml,
    "json": parse_sssom_json,
    "rdf": parse_sssom_rdf,
}

def get_parsing_function(input_format: Optional[str], filename: str) -> Callable:
    """Return the appropriate parser function based on the input format of the file.

    :param input_format: File format
    :param filename: Filename
    :raises Exception: Unknown file format
    :return: Appropriate 'read' function
    """
    if input_format is None:
        input_format = get_file_extension(filename)
    func = PARSING_FUNCTIONS.get(input_format)
    if func is None:
        raise Exception(f"Unknown input format: {input_format}")
    return func
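

# A short dispatch sketch (the file name is hypothetical); with
# ``input_format=None`` the format is guessed from the file extension:
#
#     parse_func = get_parsing_function(None, "mappings.tsv")  # -> parse_sssom_table
#     msdf = parse_func("mappings.tsv")
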
def _flip_superclass_assertion(mapping: Mapping) -> Mapping:
    if mapping.predicate_id != "sssom:superClassOf":
        return mapping
    mapping.predicate_id = "rdfs:subClassOf"
    return _swap_object_subject(mapping)


def _swap_object_subject(mapping: Mapping) -> Mapping:
    members = [
        attr.replace("subject_", "")
        for attr in dir(mapping)
        if not callable(getattr(mapping, attr))
        and not attr.startswith("__")
        and attr.startswith("subject_")
    ]
    for var in members:
        subject_val = getattr(mapping, "subject_" + var)
        object_val = getattr(mapping, "object_" + var)
        setattr(mapping, "subject_" + var, object_val)
        setattr(mapping, "object_" + var, subject_val)
    return mapping


def _read_metadata_from_table(stream: io.StringIO) -> Dict[str, Any]:
    yamlstr = ""
    for line in stream:
        if line.startswith("#"):
            yamlstr += re.sub("^#", "", line)
        else:
            break

    if yamlstr:
        meta = yaml.safe_load(yamlstr)
        logging.info(f"Meta={meta}")
        return meta
    return {}


def _set_metadata_in_mapping_set(
    mapping_set: MappingSet, metadata: Optional[MetadataType] = None, overwrite: bool = True
) -> None:
    if metadata is None:
        logging.info("Tried setting metadata but none provided.")
    else:
        for k, v in metadata.items():
            if k != CURIE_MAP:
                if (
                    hasattr(mapping_set, k)
                    and getattr(mapping_set, k) is not None
                    and not overwrite
                ):
                    continue
                mapping_set[k] = _address_multivalued_slot(k, v)


def _cell_element_values(cell_node, converter: Converter, mapping_predicates) -> Dict[str, Any]:
    mdict: Dict[str, Any] = {}
    for child in cell_node.childNodes:
        if child.nodeType == Node.ELEMENT_NODE:
            try:
                if child.nodeName == "entity1":
                    mdict[SUBJECT_ID] = safe_compress(child.getAttribute("rdf:resource"), converter)
                elif child.nodeName == "entity2":
                    mdict[OBJECT_ID] = safe_compress(child.getAttribute("rdf:resource"), converter)
                elif child.nodeName == "measure":
                    mdict[CONFIDENCE] = child.firstChild.nodeValue
                elif child.nodeName == "relation":
                    relation = child.firstChild.nodeValue
                    if (relation == "=") and (SKOS_EXACT_MATCH_URI in mapping_predicates):
                        mdict[PREDICATE_ID] = SKOS_EXACT_MATCH
                    elif (relation == "<") and (SKOS_BROAD_MATCH_URI in mapping_predicates):
                        mdict[PREDICATE_ID] = SKOS_BROAD_MATCH
                    elif (relation == ">") and (SKOS_NARROW_MATCH_URI in mapping_predicates):
                        mdict[PREDICATE_ID] = SKOS_NARROW_MATCH
                    # elif (relation == "%") and (SOMETHING in mapping_predicates):
                    #     # Incompatible.
                    #     pass
                    # elif (relation == "HasInstance") and (SOMETHING in mapping_predicates):
                    #     pass
                    elif (relation == "InstanceOf") and (RDF_TYPE_URI in mapping_predicates):
                        mdict[PREDICATE_ID] = RDF_TYPE
                    else:
                        logging.warning(f"{relation} not a recognised relation type.")
                else:
                    logging.warning(f"Unsupported alignment api element: {child.nodeName}")
            except ValueError as e:
                logging.warning(e)

    mdict[MAPPING_JUSTIFICATION] = MAPPING_JUSTIFICATION_UNSPECIFIED
    return mdict


# The following methods don't really belong in the parser package.

def to_mapping_set_document(msdf: MappingSetDataFrame) -> MappingSetDocument:
    """Convert a MappingSetDataFrame to a MappingSetDocument."""
    ms = _get_mapping_set_from_df(df=msdf.df, meta=msdf.metadata)
    return MappingSetDocument(mapping_set=ms, converter=msdf.converter)

def _get_mapping_set_from_df(df: pd.DataFrame, meta: Optional[MetadataType] = None) -> MappingSet:
    mapping_set = _init_mapping_set(meta)
    bad_attrs: Counter = Counter()
    mapping_slots = set(_get_sssom_schema_object().mapping_slots)
    df.apply(
        lambda row: _add_valid_mapping_to_list(
            _get_mapping_dict(row, bad_attrs, mapping_slots), mapping_set.mappings
        ),
        axis=1,
    )
    for k, v in bad_attrs.items():
        logging.warning(f"No attr for {k} [{v} instances]")
    return mapping_set

def split_dataframe(
    msdf: MappingSetDataFrame,
) -> Dict[str, MappingSetDataFrame]:
    """Group the mapping set dataframe into several sub-dataframes by prefix.

    :param msdf: MappingSetDataFrame object
    :raises RuntimeError: if the DataFrame object within the MappingSetDataFrame is None
    :return: a dict of split names to MappingSetDataFrame objects
    """
    subject_prefixes = set(msdf.df[SUBJECT_ID].str.split(":", n=1, expand=True)[0])
    object_prefixes = set(msdf.df[OBJECT_ID].str.split(":", n=1, expand=True)[0])
    relations = set(msdf.df[PREDICATE_ID])
    return split_dataframe_by_prefix(
        msdf=msdf,
        subject_prefixes=subject_prefixes,
        object_prefixes=object_prefixes,
        relations=relations,
    )

def split_dataframe_by_prefix(
    msdf: MappingSetDataFrame,
    subject_prefixes: Iterable[str],
    object_prefixes: Iterable[str],
    relations: Iterable[str],
) -> Dict[str, MappingSetDataFrame]:
    """Split a mapping set dataframe by prefix.

    :param msdf: An SSSOM MappingSetDataFrame
    :param subject_prefixes: a list of prefixes pertaining to the subject
    :param object_prefixes: a list of prefixes pertaining to the object
    :param relations: a list of relations of interest
    :return: a dict of SSSOM data frame names to MappingSetDataFrame
    """
    df = msdf.df
    meta = msdf.metadata
    split_to_msdf: Dict[str, MappingSetDataFrame] = {}
    for subject_prefix, object_prefix, relation in itt.product(
        subject_prefixes, object_prefixes, relations
    ):
        relation_prefix, relation_id = relation.split(":")
        split = f"{subject_prefix.lower()}_{relation_id.lower()}_{object_prefix.lower()}"
        if subject_prefix not in msdf.converter.bimap:
            logging.warning(f"{split} - missing subject prefix - {subject_prefix}")
            continue
        if object_prefix not in msdf.converter.bimap:
            logging.warning(f"{split} - missing object prefix - {object_prefix}")
            continue
        df_subset = df[
            (df[SUBJECT_ID].str.startswith(subject_prefix + ":"))
            & (df[PREDICATE_ID] == relation)
            & (df[OBJECT_ID].str.startswith(object_prefix + ":"))
        ]
        if 0 == len(df_subset):
            logging.debug(f"No matches ({len(df_subset)} matches found)")
            continue
        subconverter = msdf.converter.get_subconverter(
            [subject_prefix, object_prefix, relation_prefix]
        )
        split_to_msdf[split] = from_sssom_dataframe(
            df_subset, prefix_map=dict(subconverter.bimap), meta=meta
        )
    return split_to_msdf
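

# A usage sketch (prefixes and relation are illustrative); this would yield at
# most one split, keyed "hp_exactmatch_mp":
#
#     splits = split_dataframe_by_prefix(
#         msdf,
#         subject_prefixes={"HP"},
#         object_prefixes={"MP"},
#         relations={"skos:exactMatch"},
#     )
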
def _ensure_valid_mapping_from_dict(mdict: Dict[str, Any]):
    """Return a valid Mapping object if it can be constructed, else None.

    :param mdict: A dictionary containing the mapping metadata.
    :return: A valid Mapping object, or None.
    """
    mdict.setdefault(MAPPING_JUSTIFICATION, MAPPING_JUSTIFICATION_UNSPECIFIED)
    try:
        m = Mapping(**mdict)
        if m.subject_type == "rdfs literal":
            if m.subject_label is None:
                raise ValueError("Missing subject_label")
        elif m.subject_id is None:
            raise ValueError("Missing subject_id")
        if m.object_type == "rdfs literal":
            if m.object_label is None:
                raise ValueError("Missing object_label")
        elif m.object_id is None:
            raise ValueError("Missing object_id")
    except ValueError as e:
        logging.warning(
            f"One mapping in the mapping set is not well-formed, "
            f"and therefore not included in the mapping set ({mdict}). Error: {e}"
        )
        return None
    else:
        return m


def _add_valid_mapping_to_list(
    mdict: Dict[str, Any], mlist: List[Mapping], *, flip_superclass_assertions=False
):
    """Validate the mapping and append it to the list if valid.

    :param mdict: A dictionary containing the mapping metadata.
    :param mlist: The list to which the valid mapping should be appended.
    :param flip_superclass_assertions: an optional parameter that flips sssom:superClassOf to rdfs:subClassOf
    """
    mapping = _ensure_valid_mapping_from_dict(mdict)
    if not mapping:
        return None
    if flip_superclass_assertions:
        mapping = _flip_superclass_assertion(mapping)
    mlist.append(mapping)