Source code for sssom.parsers

"""SSSOM parsers."""

import gzip
import io
import itertools as itt
import json
import logging as _logging
import os.path
import re
import typing
from collections import ChainMap, Counter
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, TextIO, Tuple, Union, cast
from xml.dom import Node, minidom
from xml.dom.minidom import Document

import curies
import numpy as np
import pandas as pd
import requests
import yaml
from curies import Converter
from linkml_runtime.loaders.json_loader import JSONLoader
from linkml_runtime.loaders.rdflib_loader import RDFLibLoader
from pandas.errors import EmptyDataError
from rdflib import Graph
from sssom_schema import Mapping, MappingSet

from sssom.constants import (
    CONFIDENCE,
    CURIE_MAP,
    DEFAULT_MAPPING_PROPERTIES,
    LICENSE,
    MAPPING_JUSTIFICATION,
    MAPPING_JUSTIFICATION_UNSPECIFIED,
    MAPPING_SET_ID,
    OBJECT_ID,
    OBJECT_LABEL,
    OBJECT_SOURCE,
    OBJECT_SOURCE_ID,
    OBO_HAS_DB_XREF_URI,
    OWL_EQUIV_CLASS_URI,
    PREDICATE_ID,
    RDF_TYPE,
    RDF_TYPE_URI,
    RDFS_SUBCLASS_OF,
    SKOS_BROAD_MATCH,
    SKOS_BROAD_MATCH_URI,
    SKOS_EXACT_MATCH,
    SKOS_EXACT_MATCH_URI,
    SKOS_NARROW_MATCH,
    SKOS_NARROW_MATCH_URI,
    SUBJECT_ID,
    SUBJECT_LABEL,
    SUBJECT_SOURCE,
    SUBJECT_SOURCE_ID,
    MetadataType,
    PathOrIO,
    _get_sssom_schema_object,
    get_default_metadata,
)

from .context import ConverterHint, _get_built_in_prefix_map, ensure_converter
from .sssom_document import MappingSetDocument
from .util import (
    SSSOM_DEFAULT_RDF_SERIALISATION,
    ExtensionLiteral,
    MappingSetDataFrame,
    get_file_extension,
    is_multivalued_slot,
    raise_for_bad_path,
    safe_compress,
    to_mapping_set_dataframe,
)

logging = _logging.getLogger(__name__)


# * *******************************************************
# Parsers (from file)


def _open_input(p: PathOrIO) -> TextIO:
    """Transform a URL, a filepath (from pathlib), or a string (with file contents) to a StringIO object.

    :param p:
        A string representing a URL, a filepath, or file contents, or a Path object representing a filepath.
    :return: A StringIO object containing the input data.
    """
    # if we passed an IO object, return it back directly
    if not isinstance(p, (str, Path)):
        return p

    if isinstance(p, str) and (p.startswith("http://") or p.startswith("https://")):
        # It's a URL
        data = requests.get(p, timeout=30).content
        # TODO handle gzipped remote content
        return io.StringIO(data.decode("utf-8"))

    # squash a path to a string so we don't have to duplicate logic below
    if isinstance(p, Path):
        p = p.as_posix()

    if "\n" in p or "\r" in p:
        # It's string data
        return io.StringIO(p)

    if not os.path.exists(p):
        raise FileNotFoundError(f"file does not exist: {p}")

    if p.endswith(".gz"):
        with gzip.open(p, "rt") as file:
            file_content = file.read()
        return io.StringIO(file_content)
    else:
        # It's a local file path
        with open(p, "r") as file:
            file_content = file.read()
        return io.StringIO(file_content)


def _separate_metadata_and_table_from_stream(stream: TextIO):
    stream.seek(0)

    # Create a new StringIO object for filtered data
    table_component = io.StringIO()
    metadata_component = io.StringIO()

    header_section = True

    # Filter out lines starting with '#'
    for line in stream:
        if not line.startswith("#"):
            table_component.write(line)
            if header_section:
                header_section = False
        elif header_section:
            # We strip any trailing tabs. Such tabs may have been left
            # by a spreadsheet editor who treated the header lines as
            # if they were normal data lines; they would prevent the
            # YAML parser from correctly parsing the metadata block.
            metadata_component.write(line.rstrip("\t\n") + "\n")
        else:
            logging.info(
                f"Line {line} is starting with hash symbol, but header section is already passed. "
                f"This line is skipped"
            )

    # Reset the cursor to the start of the new StringIO object
    table_component.seek(0)
    metadata_component.seek(0)
    return table_component, metadata_component


def _read_pandas_and_metadata(file_path: PathOrIO, sep: Optional[str] = None):
    """Read a tabular data file by wrapping func:`pd.read_csv` to handles comment lines correctly.

    :param file_path: The file path or stream to read
    :param sep: File separator for pandas
    :return: A pandas dataframe
    """
    if sep is None:
        sep = _infer_separator(file_path)

    if isinstance(file_path, (str, Path)):
        raise_for_bad_path(file_path)

    stream = _open_input(file_path)
    table_stream, metadata_stream = _separate_metadata_and_table_from_stream(stream)

    try:
        df = pd.read_csv(table_stream, sep=sep, dtype=str, engine="python")
    except EmptyDataError as e:
        logging.warning(f"Seems like the dataframe is empty: {e}")
        df = pd.DataFrame(
            columns=[
                SUBJECT_ID,
                SUBJECT_LABEL,
                PREDICATE_ID,
                OBJECT_ID,
                MAPPING_JUSTIFICATION,
            ]
        )
    else:
        df.fillna("", inplace=True)

    sssom_metadata = _read_metadata_from_table(metadata_stream)
    return df, sssom_metadata


EXTENSION_TO_SEP: dict[ExtensionLiteral, str] = {"tsv": "\t", "csv": ","}


def _infer_separator(file: PathOrIO) -> Optional[str]:
    r"""Infer the CSV separator from a file path or IO object.

    :param file: the file path
    :return: the separator symbols as a string, e.g. '\t'
    """
    extension = get_file_extension(file)
    if extension is None:
        return None
    return EXTENSION_TO_SEP[extension]


def _is_check_valid_extension_slot(slot_name, meta):
    extension_definitions = meta.get("extension_definitions", [])
    return any(
        "property" in entry and entry.get("slot_name") == slot_name
        for entry in extension_definitions
    )


def _is_irregular_metadata(metadata_list: List[Dict]):
    fail_metadata = False
    for m in metadata_list:
        for key in m:
            if key not in _get_sssom_schema_object().mapping_set_slots:
                if not _is_check_valid_extension_slot(key, m):
                    logging.warning(
                        f"Metadata key '{key}' is not a standard SSSOM mapping set metadata field. See "
                        f"https://mapping-commons.github.io/sssom/spec-model/#non-standard-slots on how to "
                        f"specify additional, non-standard fields in a SSSOM file."
                    )
                    fail_metadata = True
    return fail_metadata


def _check_redefined_builtin_prefixes(sssom_metadata, meta, prefix_map):
    # There are three ways in which prefixes can be communicated, so we will check all of them
    # This is a bit overly draconian, as in the end, only the highest priority one gets picked
    # But since this only constitues a (logging) warning, I think its worth reporting
    builtin_converter = _get_built_in_prefix_map()
    sssom_metadata_converter = _get_converter_pop_replace_curie_map(sssom_metadata)
    meta_converter = _get_converter_pop_replace_curie_map(meta)
    prefix_map_converter = ensure_converter(prefix_map, use_defaults=False)
    is_valid_prefixes = True

    for converter in [sssom_metadata_converter, meta_converter, prefix_map_converter]:
        for builtin_prefix, builtin_uri in builtin_converter.bimap.items():
            if builtin_prefix in converter.bimap:
                if builtin_uri != converter.bimap[builtin_prefix]:
                    logging.warning(
                        f"A built-in prefix ({builtin_prefix}) was provided, "
                        f"but the provided URI expansion ({converter.bimap[builtin_prefix]}) does not correspond "
                        f"to the required URI expansion: {builtin_uri}. The prefix will be ignored."
                    )
                    is_valid_prefixes = False
            # NOTE during refactor replace the following line by https://github.com/biopragmatics/curies/pull/136
            reverse_bimap = {value: key for key, value in builtin_converter.bimap.items()}
            if builtin_uri in reverse_bimap:
                if builtin_prefix != reverse_bimap[builtin_uri]:
                    logging.warning(
                        f"A built-in URI namespace ({builtin_uri}) was used in (one of) the provided prefix map(s), "
                        f"but the provided prefix ({reverse_bimap[builtin_uri]}) does not correspond to the "
                        f"standard prefix: {builtin_prefix}. The prefix will be ignored."
                    )
                    is_valid_prefixes = False
    return is_valid_prefixes


def _fail_in_strict_parsing_mode(is_valid_built_in_prefixes, is_valid_metadata):
    report = ""
    if not is_valid_built_in_prefixes:
        report += "STRONG WARNING: The prefix map provided contains built-in prefixes that were redefined.+\n"
    if not is_valid_metadata:
        report += (
            "STRONG WARNING: The metadata provided contains non-standard and undefined metadata.+\n"
        )

    if report:
        raise ValueError(report)


def _get_converter_pop_replace_curie_map(sssom_metadata):
    """
    Pop CURIE_MAP from sssom_metadata, process it, and restore it if it existed.

    Args:
        sssom_metadata (dict): The metadata dictionary.

    Returns:
        Converter: A Converter object created from the CURIE_MAP.
    """
    curie_map = sssom_metadata.pop(CURIE_MAP, {})

    # Process the popped value
    sssom_metadata_converter = Converter.from_prefix_map(curie_map)

    # Reinsert CURIE_MAP if it was present
    if curie_map:
        sssom_metadata[CURIE_MAP] = curie_map

    return sssom_metadata_converter



[docs]
def parse_sssom_table(
    file_path: PathOrIO,
    prefix_map: ConverterHint = None,
    meta: Optional[MetadataType] = None,
    *,
    strict: bool = False,
    sep: Optional[str] = None,
    **kwargs: Any,
) -> MappingSetDataFrame:
    """Parse a SSSOM CSV or TSV file.

    :param file_path:
        A file path, URL, or I/O object that contains SSSOM encoded in TSV
    :param prefix_map:
        A prefix map or :class:`curies.Converter` used to validate prefixes,
        CURIEs, and IRIs appearing in the SSSOM TSV
    :param meta:
        Additional document-level metadata for the SSSOM TSV document that is not
        contained within the document itself. For example, this may come from a
        companion SSSOM YAML file.
    :param strict:
        If true, will fail parsing for undefined prefixes, CURIEs, or IRIs
    :param sep:
        The seperator. If not given, inferred from file name
    :param kwargs:
        Additional keyword arguments (unhandled)
    :returns:
        A parsed dataframe wrapper object
    """
    if kwargs:
        logging.warning("unhandled keyword arguments passed: %s", kwargs)

    df, sssom_metadata = _read_pandas_and_metadata(file_path, sep)
    if meta is None:
        meta = {}

    is_valid_built_in_prefixes = _check_redefined_builtin_prefixes(sssom_metadata, meta, prefix_map)
    is_valid_metadata = _is_irregular_metadata([sssom_metadata, meta])

    if strict:
        _fail_in_strict_parsing_mode(is_valid_built_in_prefixes, is_valid_metadata)

    # The priority order for combining prefix maps are:
    #  1. Built-in prefix map
    #  2. Internal prefix map inside the document
    #  3. Prefix map passed through this function inside the ``meta``
    #  4. Prefix map passed through this function to ``prefix_map`` (handled with ensure_converter)
    converter = curies.chain(
        [
            _get_built_in_prefix_map(),
            Converter.from_prefix_map(sssom_metadata.pop(CURIE_MAP, {})),
            Converter.from_prefix_map(meta.pop(CURIE_MAP, {})),
            ensure_converter(prefix_map, use_defaults=False),
        ]
    )

    # The priority order for combining metadata is:
    #  1. Metadata appearing in the SSSOM document
    #  2. Metadata passed through ``meta`` to this function
    #  3. Default metadata
    combine_meta = dict(
        ChainMap(
            sssom_metadata,
            meta,
            get_default_metadata(),
        )
    )

    msdf = from_sssom_dataframe(df, prefix_map=converter, meta=combine_meta)
    return msdf




[docs]
def parse_csv(*args, **kwargs) -> MappingSetDataFrame:
    """Parse a SSSOM CSV file, forwarding arguments to :func:`parse_sssom_table`."""
    kwargs["sep"] = ","
    return parse_sssom_table(*args, **kwargs)




[docs]
def parse_tsv(*args, **kwargs) -> MappingSetDataFrame:
    """Parse a SSSOM TSV file, forwarding arguments to :func:`parse_sssom_table`."""
    kwargs["sep"] = "\t"
    return parse_sssom_table(*args, **kwargs)




[docs]
def parse_sssom_rdf(
    file_path: Union[str, Path],
    prefix_map: ConverterHint = None,
    meta: Optional[MetadataType] = None,
    serialisation=SSSOM_DEFAULT_RDF_SERIALISATION,
    **kwargs,
    # mapping_predicates: Optional[List[str]] = None,
) -> MappingSetDataFrame:
    """Parse a TSV to a :class:`MappingSetDocument` to a :class:`MappingSetDataFrame`."""
    raise_for_bad_path(file_path)

    g = Graph()
    g.parse(file_path, format=serialisation)

    # Initialize meta if it's None
    if meta is None:
        meta = {}

    # The priority order for combining prefix maps are:
    #  1. Built-in prefix map
    #  2. Internal prefix map inside the document
    #  3. Prefix map passed through this function inside the ``meta``
    #  4. Prefix map passed through this function to ``prefix_map`` (handled with ensure_converter)
    converter = curies.chain(
        [
            _get_built_in_prefix_map(),
            Converter.from_rdflib(g),
            Converter.from_prefix_map(meta.pop(CURIE_MAP, {})),
            ensure_converter(prefix_map, use_defaults=False),
        ]
    )
    msdf = from_sssom_rdf(g, prefix_map=converter, meta=meta)
    # df: pd.DataFrame = msdf.df
    # if mapping_predicates and not df.empty():
    #     msdf.df = df[df["predicate_id"].isin(mapping_predicates)]
    return msdf




[docs]
def parse_sssom_json(
    file_path: Union[str, Path],
    prefix_map: ConverterHint = None,
    meta: Optional[MetadataType] = None,
    **kwargs,
) -> MappingSetDataFrame:
    """Parse a TSV to a :class:`MappingSetDocument` to a :class:`MappingSetDataFrame`."""
    raise_for_bad_path(file_path)

    with open(file_path) as json_file:
        jsondoc = json.load(json_file)

    # Initialize meta if it's None
    if meta is None:
        meta = {}

    # The priority order for combining prefix maps are:
    #  1. Built-in prefix map
    #  2. Internal prefix map inside the document
    #  3. Prefix map passed through this function inside the ``meta``
    #  4. Prefix map passed through this function to ``prefix_map`` (handled with ensure_converter)
    converter = curies.chain(
        [
            _get_built_in_prefix_map(),
            Converter.from_jsonld(file_path),
            Converter.from_prefix_map(meta.pop(CURIE_MAP, {})),
            ensure_converter(prefix_map, use_defaults=False),
        ]
    )

    msdf = from_sssom_json(jsondoc=jsondoc, prefix_map=converter, meta=meta)
    return msdf



# Import methods from external file formats



[docs]
def parse_obographs_json(
    file_path: Union[str, Path],
    prefix_map: ConverterHint = None,
    meta: Optional[MetadataType] = None,
    mapping_predicates: Optional[List[str]] = None,
) -> MappingSetDataFrame:
    """Parse an obographs file as a JSON object and translates it into a MappingSetDataFrame.

    :param file_path: The path to the obographs file
    :param prefix_map: an optional prefix map
    :param meta: an optional dictionary of metadata elements
    :param mapping_predicates: an optional list of mapping predicates that should be extracted
    :return: A SSSOM MappingSetDataFrame
    """
    raise_for_bad_path(file_path)

    converter, meta = _get_prefix_map_and_metadata(prefix_map=prefix_map, meta=meta)

    with open(file_path) as json_file:
        jsondoc = json.load(json_file)

    return from_obographs(
        jsondoc,
        prefix_map=converter,
        meta=meta,
        mapping_predicates=mapping_predicates,
    )



def _get_prefix_map_and_metadata(
    prefix_map: ConverterHint = None, meta: Optional[MetadataType] = None
) -> Tuple[Converter, MetadataType]:
    if meta is None:
        meta = get_default_metadata()
    converter = curies.chain(
        [
            _get_built_in_prefix_map(),
            Converter.from_prefix_map(meta.pop(CURIE_MAP, {})),
            ensure_converter(prefix_map, use_defaults=False),
        ]
    )
    return converter, meta


def _address_multivalued_slot(k: str, v: Any) -> Union[str, List[str]]:
    if isinstance(v, str) and is_multivalued_slot(k):
        # IF k is multivalued, then v = List[values]
        return [s.strip() for s in v.split("|")]
    else:
        return v


def _init_mapping_set(meta: Optional[MetadataType]) -> MappingSet:
    _metadata = dict(ChainMap(meta or {}, get_default_metadata()))
    mapping_set = MappingSet(mapping_set_id=_metadata[MAPPING_SET_ID], license=_metadata[LICENSE])
    _set_metadata_in_mapping_set(mapping_set=mapping_set, metadata=meta)
    return mapping_set


def _get_mapping_dict(
    row: pd.Series, bad_attrs: Counter, mapping_slots: typing.Set[str]
) -> Dict[str, Any]:
    """Generate a mapping dictionary from a given row of data.

    It also updates the 'bad_attrs' counter for keys that are not present
    in the sssom_schema_object's mapping_slots.
    """
    # Populate the mapping dictionary with key-value pairs from the row,
    # only if the value exists, is not NaN, and the key is in the schema's mapping slots.
    # The value could be a string or a list and is handled accordingly via _address_multivalued_slot().

    mdict = {
        k: _address_multivalued_slot(k, v)
        for k, v in row.items()
        if v and pd.notna(v) and k in mapping_slots
    }

    # Update bad_attrs for keys not in mapping_slots
    bad_keys = set(row.keys()) - mapping_slots
    for bad_key in bad_keys:
        bad_attrs[bad_key] += 1
    return mdict



[docs]
def parse_alignment_xml(
    file_path: Union[str, Path],
    prefix_map: ConverterHint = None,
    meta: Optional[MetadataType] = None,
    mapping_predicates: Optional[List[str]] = None,
) -> MappingSetDataFrame:
    """Parse a TSV -> MappingSetDocument -> MappingSetDataFrame."""
    raise_for_bad_path(file_path)

    converter, meta = _get_prefix_map_and_metadata(prefix_map=prefix_map, meta=meta)
    logging.info("Loading from alignment API")
    xmldoc = minidom.parse(Path(file_path).resolve().as_posix())
    msdf = from_alignment_minidom(
        xmldoc,
        prefix_map=converter,
        meta=meta,
        mapping_predicates=mapping_predicates,
    )
    return msdf



# Readers (from object)



[docs]
def from_sssom_dataframe(
    df: pd.DataFrame,
    prefix_map: ConverterHint = None,
    meta: Optional[MetadataType] = None,
) -> MappingSetDataFrame:
    """Convert a dataframe to a MappingSetDataFrame.

    :param df: A mappings dataframe
    :param prefix_map: A prefix map
    :param meta: A metadata dictionary
    :return: MappingSetDataFrame
    """
    converter = ensure_converter(prefix_map)

    # Need to revisit this solution.
    # This is to address: A value is trying to be set on a copy of a slice from a DataFrame
    if CONFIDENCE in df.columns:
        df.replace({CONFIDENCE: r"^\s*$"}, np.nan, regex=True, inplace=True)

    mapping_set = _get_mapping_set_from_df(df=df, meta=meta)
    doc = MappingSetDocument(mapping_set=mapping_set, converter=converter)
    return to_mapping_set_dataframe(doc)




[docs]
def from_sssom_rdf(
    g: Graph,
    prefix_map: ConverterHint = None,
    meta: Optional[MetadataType] = None,
) -> MappingSetDataFrame:
    """Convert an SSSOM RDF graph into a SSSOM data table.

    :param g: the Graph (rdflib)
    :param prefix_map: A dictionary containing the prefix map, defaults to None
    :param meta: Potentially additional metadata, defaults to None
    :return: MappingSetDataFrame object
    """
    converter = ensure_converter(prefix_map)
    mapping_set = cast(
        MappingSet,
        RDFLibLoader().load(
            source=g,
            target_class=MappingSet,
            schemaview=_get_sssom_schema_object().view,
            prefix_map=converter.bimap,
            ignore_unmapped_predicates=True,
        ),
    )

    # The priority order for combining metadata is:
    #  1. Metadata appearing in the SSSOM document
    #  2. Metadata passed through ``meta`` to this function
    #  3. Default metadata

    # As the Metadata appearing in the SSSOM document is already parsed by LinkML
    # we only need to overwrite the metadata from 2 and 3 if it is not present
    combine_meta = dict(
        ChainMap(
            meta or {},
            get_default_metadata(),
        )
    )

    _set_metadata_in_mapping_set(mapping_set, metadata=combine_meta, overwrite=False)
    mdoc = MappingSetDocument(mapping_set=mapping_set, converter=converter)
    return to_mapping_set_dataframe(mdoc)




[docs]
def from_sssom_json(
    jsondoc: Union[str, dict, TextIO],
    prefix_map: ConverterHint = None,
    meta: Optional[MetadataType] = None,
) -> MappingSetDataFrame:
    """Load a mapping set dataframe from a JSON object.

    :param jsondoc: JSON document
    :param prefix_map: Prefix map
    :param meta: metadata used to augment the metadata existing in the mapping set
    :return: MappingSetDataFrame object
    """
    converter = ensure_converter(prefix_map)

    mapping_set = cast(MappingSet, JSONLoader().load(source=jsondoc, target_class=MappingSet))

    # The priority order for combining metadata is:
    #  1. Metadata appearing in the SSSOM document
    #  2. Metadata passed through ``meta`` to this function
    #  3. Default metadata

    # As the Metadata appearing in the SSSOM document is already parsed by LinkML
    # we only need to overwrite the metadata from 2 and 3 if it is not present
    combine_meta = dict(
        ChainMap(
            meta or {},
            get_default_metadata(),
        )
    )

    _set_metadata_in_mapping_set(mapping_set, metadata=combine_meta, overwrite=False)
    mapping_set_document = MappingSetDocument(mapping_set=mapping_set, converter=converter)
    return to_mapping_set_dataframe(mapping_set_document)




[docs]
def from_alignment_minidom(
    dom: Document,
    prefix_map: ConverterHint = None,
    meta: Optional[MetadataType] = None,
    mapping_predicates: Optional[List[str]] = None,
) -> MappingSetDataFrame:
    """Read a minidom Document object.

    :param dom: XML (minidom) object
    :param prefix_map: A prefix map
    :param meta: Optional meta data
    :param mapping_predicates: Optional list of mapping predicates to extract
    :return: MappingSetDocument
    :raises ValueError: for alignment format: xml element said, but not set to yes. Only XML is supported!
    """
    converter = ensure_converter(prefix_map)
    ms = _init_mapping_set(meta)
    mlist: List[Mapping] = []
    # bad_attrs = {}

    if not mapping_predicates:
        mapping_predicates = DEFAULT_MAPPING_PROPERTIES

    alignments = dom.getElementsByTagName("Alignment")
    for n in alignments:
        for e in n.childNodes:
            if e.nodeType == Node.ELEMENT_NODE:
                node_name = e.nodeName
                if node_name == "map":
                    cell = e.getElementsByTagName("Cell")
                    for c_node in cell:
                        mdict: Dict[str, Any] = _cell_element_values(
                            c_node, converter, mapping_predicates=mapping_predicates
                        )
                        _add_valid_mapping_to_list(mdict, mlist, flip_superclass_assertions=True)

                elif node_name == "xml":
                    if e.firstChild.nodeValue != "yes":  # type:ignore[union-attr]
                        raise ValueError(
                            "Alignment format: xml element said, but not set to yes. Only XML is supported!"
                        )
                elif node_name == "onto1":
                    ms[SUBJECT_SOURCE_ID] = e.firstChild.nodeValue  # type:ignore[union-attr]
                elif node_name == "onto2":
                    ms[OBJECT_SOURCE_ID] = e.firstChild.nodeValue  # type:ignore[union-attr]
                elif node_name == "uri1":
                    ms[SUBJECT_SOURCE] = e.firstChild.nodeValue  # type:ignore[union-attr]
                elif node_name == "uri2":
                    ms[OBJECT_SOURCE] = e.firstChild.nodeValue  # type:ignore[union-attr]

    ms.mappings = mlist  # type: ignore
    mapping_set_document = MappingSetDocument(mapping_set=ms, converter=converter)
    return to_mapping_set_dataframe(mapping_set_document)



def _get_obographs_predicate_id(obographs_predicate: str):
    if obographs_predicate == "is_a":
        return RDFS_SUBCLASS_OF
    return obographs_predicate



[docs]
def from_obographs(
    jsondoc: Dict,
    prefix_map: ConverterHint = None,
    meta: Optional[MetadataType] = None,
    mapping_predicates: Optional[List[str]] = None,
) -> MappingSetDataFrame:
    """Convert a obographs json object to an SSSOM data frame.

    :param jsondoc: The JSON object representing the ontology in obographs format
    :param prefix_map: The prefix map to be used
    :param meta: Any additional metadata that needs to be added to the resulting SSSOM data frame, defaults to None
    :param mapping_predicates: Optional list of mapping predicates to extract
    :raises Exception: When there is no CURIE
    :return: An SSSOM data frame (MappingSetDataFrame)
    """
    converter = ensure_converter(prefix_map)
    ms = _init_mapping_set(meta)
    mlist: List[Mapping] = []

    if not mapping_predicates:
        mapping_predicates = DEFAULT_MAPPING_PROPERTIES

    graphs = jsondoc.get("graphs")
    if not graphs:
        raise Exception("No graphs element in obographs file, wrong format?")

    #: A dictionary of node URIs to node labels
    labels: Mapping[str, str] = {
        node["id"]: node.get("lbl")
        for graph in graphs
        for node in graph.get("nodes", [])
        if node.get("lbl")
    }

    for graph in graphs:
        for node in graph.get("nodes", []):
            meta = node.get("meta")
            if not meta:
                continue

            node_uri = node["id"]
            if OBO_HAS_DB_XREF_URI in mapping_predicates:
                for xref in meta.get("xrefs", []):
                    mdict = _make_mdict(
                        node_uri, OBO_HAS_DB_XREF_URI, xref["val"], converter, labels
                    )
                    _add_valid_mapping_to_list(mdict, mlist)

            for value in meta.get("basicPropertyValues", []):
                predicate_uri = value["pred"]
                if predicate_uri not in mapping_predicates:
                    continue
                mdict = _make_mdict(node_uri, predicate_uri, value["val"], converter, labels)
                _add_valid_mapping_to_list(mdict, mlist)

        for edge in graph.get("edges", []):
            predicate_uri = _get_obographs_predicate_id(edge["pred"])
            if predicate_uri not in mapping_predicates:
                continue
            mdict = _make_mdict(edge["sub"], predicate_uri, edge["obj"], converter, labels)
            _add_valid_mapping_to_list(mdict, mlist)

        if OWL_EQUIV_CLASS_URI in mapping_predicates:
            for equivalents in graph.get("equivalentNodesSets", []):
                node_uris = equivalents.get("nodeIds")
                if not node_uris:
                    continue
                for subject_uri, object_uri in itt.product(node_uris, repeat=2):
                    if subject_uri == object_uri:
                        continue
                    mdict = _make_mdict(
                        subject_uri, OWL_EQUIV_CLASS_URI, object_uri, converter, labels
                    )
                    _add_valid_mapping_to_list(mdict, mlist)

    ms.mappings = mlist  # type: ignore
    mdoc = MappingSetDocument(mapping_set=ms, converter=converter)
    return to_mapping_set_dataframe(mdoc)



def _make_mdict(
    subject_id: str,
    predicate_id: str,
    object_id: str,
    converter: Converter,
    labels: typing.Mapping[str, str],
):
    mdict = {
        MAPPING_JUSTIFICATION: MAPPING_JUSTIFICATION_UNSPECIFIED,
    }
    try:
        subject_curie = safe_compress(subject_id, converter)
    except ValueError as e:
        logging.debug("could not parse subject %s - %s", subject_id, e)
    else:
        mdict[SUBJECT_ID] = subject_curie

    try:
        predicate_curie = safe_compress(predicate_id, converter)
    except ValueError as e:
        logging.debug("could not parse predicate %s - %s", predicate_id, e)
    else:
        mdict[PREDICATE_ID] = predicate_curie

    try:
        object_curie = safe_compress(object_id, converter)
    except ValueError as e:
        logging.debug("could not parse object %s - %s", object_id, e)
    else:
        mdict[OBJECT_ID] = object_curie

    if subject_id in labels:
        mdict[SUBJECT_LABEL] = labels[subject_id]
    if object_id in labels:
        mdict[OBJECT_LABEL] = labels[object_id]
    return mdict


# All from_* take as an input a python object (data frame, json, etc.) and return a MappingSetDataFrame
# All read_* take as an input a file handle and return a MappingSetDataFrame (usually wrapping a from_* method)


PARSING_FUNCTIONS: typing.Mapping[str, Callable] = {
    "csv": parse_sssom_table,
    "tsv": parse_sssom_table,
    "obographs-json": parse_obographs_json,
    "alignment-api-xml": parse_alignment_xml,
    "json": parse_sssom_json,
    "rdf": parse_sssom_rdf,
}



[docs]
def get_parsing_function(input_format: Optional[str], filename: str) -> Callable:
    """Return appropriate parser function based on input format of file.

    :param input_format: File format
    :param filename: Filename
    :raises ValueError: Unknown file format
    :return: Appropriate 'read' function
    """
    if input_format is None:
        input_format = get_file_extension(filename) or "tsv"
    func = PARSING_FUNCTIONS.get(input_format)
    if func is None:
        raise ValueError(f"Unknown input format: {input_format}")
    return func



def _flip_superclass_assertion(mapping: Mapping) -> Mapping:
    if mapping.predicate_id != "sssom:superClassOf":
        return mapping
    mapping.predicate_id = "rdfs:subClassOf"
    return _swap_object_subject(mapping)


def _swap_object_subject(mapping: Mapping) -> Mapping:
    members = [
        attr.replace("subject_", "")
        for attr in dir(mapping)
        if not callable(getattr(mapping, attr))
        and not attr.startswith("__")
        and attr.startswith("subject_")
    ]
    for var in members:
        subject_val = getattr(mapping, "subject_" + var)
        object_val = getattr(mapping, "object_" + var)
        setattr(mapping, "subject_" + var, object_val)
        setattr(mapping, "object_" + var, subject_val)
    return mapping


def _read_metadata_from_table(stream: io.StringIO) -> Dict[str, Any]:
    yamlstr = ""
    for line in stream:
        if line.startswith("#"):
            yamlstr += re.sub("^#", "", line)
        else:
            break

    if yamlstr:
        meta = yaml.safe_load(yamlstr)
        logging.info(f"Meta={meta}")
        return meta
    return {}


def _set_metadata_in_mapping_set(
    mapping_set: MappingSet, metadata: Optional[MetadataType] = None, overwrite: bool = True
) -> None:
    if metadata is None:
        logging.info("Tried setting metadata but none provided.")
    else:
        for k, v in metadata.items():
            if k != CURIE_MAP:
                if (
                    hasattr(mapping_set, k)
                    and getattr(mapping_set, k) is not None
                    and not overwrite
                ):
                    continue
                mapping_set[k] = _address_multivalued_slot(k, v)


def _cell_element_values(cell_node, converter: Converter, mapping_predicates) -> Dict[str, Any]:
    mdict: Dict[str, Any] = {}
    for child in cell_node.childNodes:
        if child.nodeType == Node.ELEMENT_NODE:
            try:
                if child.nodeName == "entity1":
                    mdict[SUBJECT_ID] = safe_compress(child.getAttribute("rdf:resource"), converter)
                elif child.nodeName == "entity2":
                    mdict[OBJECT_ID] = safe_compress(child.getAttribute("rdf:resource"), converter)
                elif child.nodeName == "measure":
                    mdict[CONFIDENCE] = child.firstChild.nodeValue
                elif child.nodeName == "relation":
                    relation = child.firstChild.nodeValue
                    if (relation == "=") and (SKOS_EXACT_MATCH_URI in mapping_predicates):
                        mdict[PREDICATE_ID] = SKOS_EXACT_MATCH
                    elif (relation == "<") and (SKOS_BROAD_MATCH_URI in mapping_predicates):
                        mdict[PREDICATE_ID] = SKOS_BROAD_MATCH
                    elif (relation == ">") and (SKOS_NARROW_MATCH_URI in mapping_predicates):
                        mdict[PREDICATE_ID] = SKOS_NARROW_MATCH
                    # elif (relation == "%") and (SOMETHING in mapping_predicates)
                    #     # Incompatible.
                    #     pass
                    # elif (relation == "HasInstance") and (SOMETHING in mapping_predicates):
                    #     pass
                    elif (relation == "InstanceOf") and (RDF_TYPE_URI in mapping_predicates):
                        mdict[PREDICATE_ID] = RDF_TYPE
                    else:
                        logging.warning(f"{relation} not a recognised relation type.")
                else:
                    logging.warning(f"Unsupported alignment api element: {child.nodeName}")
            except ValueError as e:
                logging.warning(e)

    mdict[MAPPING_JUSTIFICATION] = MAPPING_JUSTIFICATION_UNSPECIFIED
    return mdict


# The following methods dont really belong in the parser package..



[docs]
def to_mapping_set_document(msdf: MappingSetDataFrame) -> MappingSetDocument:
    """Convert a MappingSetDataFrame to a MappingSetDocument."""
    ms = _get_mapping_set_from_df(df=msdf.df, meta=msdf.metadata)
    return MappingSetDocument(mapping_set=ms, converter=msdf.converter)



def _get_mapping_set_from_df(df: pd.DataFrame, meta: Optional[MetadataType] = None) -> MappingSet:
    mapping_set = _init_mapping_set(meta)
    bad_attrs: Counter = Counter()

    mapping_slots = set(_get_sssom_schema_object().mapping_slots)

    df.apply(
        lambda row: _add_valid_mapping_to_list(
            _get_mapping_dict(row, bad_attrs, mapping_slots), mapping_set.mappings
        ),
        axis=1,
    )

    for k, v in bad_attrs.items():
        logging.warning(f"No attr for {k} [{v} instances]")
    return mapping_set



[docs]
def split_dataframe(
    msdf: MappingSetDataFrame,
) -> Dict[str, MappingSetDataFrame]:
    """Group the mapping set dataframe into several subdataframes by prefix.

    :param msdf: MappingSetDataFrame object
    :raises RuntimeError: DataFrame object within MappingSetDataFrame is None
    :return: Mapping object
    """
    subject_prefixes = set(msdf.df[SUBJECT_ID].str.split(":", n=1, expand=True)[0])
    object_prefixes = set(msdf.df[OBJECT_ID].str.split(":", n=1, expand=True)[0])
    relations = set(msdf.df[PREDICATE_ID])
    return split_dataframe_by_prefix(
        msdf=msdf,
        subject_prefixes=subject_prefixes,
        object_prefixes=object_prefixes,
        relations=relations,
    )




[docs]
def split_dataframe_by_prefix(
    msdf: MappingSetDataFrame,
    subject_prefixes: Iterable[str],
    object_prefixes: Iterable[str],
    relations: Iterable[str],
) -> Dict[str, MappingSetDataFrame]:
    """Split a mapping set dataframe by prefix.

    :param msdf: An SSSOM MappingSetDataFrame
    :param subject_prefixes: a list of prefixes pertaining to the subject
    :param object_prefixes: a list of prefixes pertaining to the object
    :param relations: a list of relations of interest
    :return: a dict of SSSOM data frame names to MappingSetDataFrame
    """
    df = msdf.df
    meta = msdf.metadata
    split_to_msdf: Dict[str, MappingSetDataFrame] = {}
    for subject_prefix, object_prefix, relation in itt.product(
        subject_prefixes, object_prefixes, relations
    ):
        relation_prefix, relation_id = relation.split(":")
        split = f"{subject_prefix.lower()}_{relation_id.lower()}_{object_prefix.lower()}"
        if subject_prefix not in msdf.converter.bimap:
            logging.warning(f"{split} - missing subject prefix - {subject_prefix}")
            continue
        if object_prefix not in msdf.converter.bimap:
            logging.warning(f"{split} - missing object prefix - {object_prefix}")
            continue
        df_subset = df[
            (df[SUBJECT_ID].str.startswith(subject_prefix + ":"))
            & (df[PREDICATE_ID] == relation)
            & (df[OBJECT_ID].str.startswith(object_prefix + ":"))
        ]
        if 0 == len(df_subset):
            logging.debug(f"No matches ({len(df_subset)} matches found)")
            continue
        subconverter = msdf.converter.get_subconverter(
            [subject_prefix, object_prefix, relation_prefix]
        )
        split_to_msdf[split] = from_sssom_dataframe(
            df_subset, prefix_map=dict(subconverter.bimap), meta=meta
        )
    return split_to_msdf



def _ensure_valid_mapping_from_dict(mdict: Dict[str, Any]):
    """
    Return a valid mapping object if it can be constructed, else None.

    :param mdict: A dictionary containing the mapping metadata.
    :return: A valid Mapping object, or None.
    """
    mdict.setdefault(MAPPING_JUSTIFICATION, MAPPING_JUSTIFICATION_UNSPECIFIED)

    try:
        m = Mapping(**mdict)
        if m.subject_type == "rdfs literal":
            if m.subject_label is None:
                raise ValueError("Missing subject_label")
        elif m.subject_id is None:
            raise ValueError("Missing subject_id")
        if m.object_type == "rdfs literal":
            if m.object_label is None:
                raise ValueError("Missing object_label")
        elif m.object_id is None:
            raise ValueError("Missing object_id")
    except ValueError as e:
        logging.warning(
            f"One mapping in the mapping set is not well-formed, "
            f"and therfore not included in the mapping set ({mdict}). Error: {e}"
        )
        return None
    else:
        return m


def _add_valid_mapping_to_list(
    mdict: Dict[str, Any], mlist: List[Mapping], *, flip_superclass_assertions=False
):
    """
    Validate the mapping and append to the list if valid.

    Parameters:
    - mdict (dict): A dictionary containing the mapping metadata.
    - mlist (list): The list to which the valid mapping should be appended.
    - flip_superclass_assertions (bool): an optional paramter that flips sssom:superClassOf to rdfs:subClassOf
    """
    mapping = _ensure_valid_mapping_from_dict(mdict)
    if not mapping:
        return None
    if flip_superclass_assertions:
        mapping = _flip_superclass_assertion(mapping)
    mlist.append(mapping)