Skip to content

I/O

High-level I/O operations used by the CLI commands.

sssom.io

I/O utilities for SSSOM.

convert_file(input_path, output, output_format=None, propagate=True, condense=True)

Convert a file from one format to another.

:param input_path: The path to the input SSSOM tsv file :param output: The path to the output file. If none is given, will default to using stdout. :param output_format: The format to which the SSSOM TSV should be converted. :param propagate: Propagate condensed slots in the input file. :param condense: Condense slots in the output file.

Source code in src/sssom/io.py
def convert_file(
    input_path: str,
    output: TextIO,
    output_format: Optional[str] = None,
    propagate: bool = True,
    condense: bool = True,
) -> None:
    """Convert a file from one format to another.

    :param input_path: The path to the input SSSOM tsv file
    :param output: The path to the output file. If none is given, will default to using stdout.
    :param output_format: The format to which the SSSOM TSV should be converted.
    :param propagate: Propagate condensed slots in the input file.
    :param condense: Condense slots in the output file.
    """
    raise_for_bad_path(input_path)
    mapping_set = parse_sssom_table(input_path, propagate=propagate)
    # Pick the writer matching the requested output format (or the default).
    writer, serialisation = get_writer_function(output_format=output_format, output=output)
    # TODO cthoyt figure out how to use protocols for this
    writer(mapping_set, output, serialisation=serialisation, condense=condense)  # type: ignore

parse_file(input_path, output, *, input_format=None, metadata_path=None, prefix_map_mode=None, clean_prefixes=True, strict_clean_prefixes=True, embedded_mode=True, mapping_predicate_filter=None, propagate=True, condense=True)

Parse an SSSOM metadata file and write to a table.

:param input_path: The path to the input file in one of the legal formats, eg obographs, alignmentapi-xml :param output: The path to the output file. :param input_format: The string denoting the input format. :param metadata_path: The path to a file containing the sssom metadata (including prefix_map) to be used during parse. :param prefix_map_mode: Defines whether the prefix map in the metadata should be extended or replaced with the SSSOM default prefix map derived from the :mod:bioregistry. :param clean_prefixes: If True (default), records with unknown prefixes are removed from the SSSOM file. :param strict_clean_prefixes: If True (default), clean_prefixes() will be in strict mode. :param embedded_mode: If True (default), the dataframe and metadata are exported in one file (tsv), else two separate files (tsv and yaml). :param mapping_predicate_filter: Optional list of mapping predicates or filepath containing the same. :param propagate: If true, propagate all condensed slots in the input set. :param condense: If true, condense slots in the output set.

Source code in src/sssom/io.py
def parse_file(
    input_path: str,
    output: TextIO,
    *,
    input_format: Optional[str] = None,
    metadata_path: Optional[str] = None,
    prefix_map_mode: Optional[MergeMode] = None,
    clean_prefixes: bool = True,
    strict_clean_prefixes: bool = True,
    embedded_mode: bool = True,
    mapping_predicate_filter: RecursivePathList | None = None,
    propagate: bool = True,
    condense: bool = True,
) -> None:
    """Parse an SSSOM metadata file and write to a table.

    :param input_path: The path to the input file in one of the legal formats, eg obographs,
        alignmentapi-xml
    :param output: The path to the output file.
    :param input_format: The string denoting the input format.
    :param metadata_path: The path to a file containing the sssom metadata (including prefix_map) to
        be used during parse.
    :param prefix_map_mode: Defines whether the prefix map in the metadata should be extended or
        replaced with the SSSOM default prefix map derived from the :mod:`bioregistry`.
    :param clean_prefixes: If True (default), records with unknown prefixes are removed from the
        SSSOM file.
    :param strict_clean_prefixes: If True (default), clean_prefixes() will be in strict mode.
    :param embedded_mode: If True (default), the dataframe and metadata are exported in one file
        (tsv), else two separate files (tsv and yaml).
    :param mapping_predicate_filter: Optional list of mapping predicates or filepath containing the
        same.
    :param propagate: If true, propagate all condensed slots in the input set.
    :param condense: If true, condense slots in the output set.
    """
    raise_for_bad_path(input_path)
    converter, meta = _get_converter_and_metadata(
        metadata_path=metadata_path, prefix_map_mode=prefix_map_mode
    )
    parser = get_parsing_function(input_format, input_path)
    # Resolve the optional predicate filter to a concrete list of IRIs.
    predicate_iris = (
        extract_iris(mapping_predicate_filter, converter) if mapping_predicate_filter else None
    )
    msdf = parser(
        input_path,
        prefix_map=converter,
        meta=meta,
        mapping_predicates=predicate_iris,
        propagate=propagate,
    )
    if clean_prefixes:
        # The default SSSOM prefixes contribute many entries the mapping set
        # may never reference, so optionally prune the prefix map.
        msdf.clean_prefix_map(strict=strict_clean_prefixes)
    write_table(msdf, output, embedded_mode, condense=condense)

validate_file(input_path, validation_types=None, fail_on_error=True, propagate=True)

Validate the incoming SSSOM TSV according to the SSSOM specification.

:param input_path: The path to the input file in one of the legal formats, eg obographs, alignmentapi-xml :param validation_types: A list of validation types to run. :param fail_on_error: Should an exception be raised on error of any validator? :param propagate: If true, propagate condensed slots in the input set.

:returns: A dictionary from validation types to validation reports

Source code in src/sssom/io.py
def validate_file(
    input_path: str,
    validation_types: Optional[List[SchemaValidationType]] = None,
    fail_on_error: bool = True,
    propagate: bool = True,
) -> dict[SchemaValidationType, ValidationReport]:
    """Validate the incoming SSSOM TSV according to the SSSOM specification.

    :param input_path: The path to the input file in one of the legal formats, eg obographs,
        alignmentapi-xml
    :param validation_types: A list of validation types to run.
    :param fail_on_error: Should an exception be raised on error of _any_ validator?
    :param propagate: If true, propagate condensed slots in the input set.

    :returns: A dictionary from validation types to validation reports
    """
    # Validation covers (1) that all prefixes used in the DataFrame are defined
    # in the prefix_map and (2) that all columns abide by sssom-schema.
    mapping_set_df = parse_sssom_table(file_path=input_path, propagate=propagate)
    return validate(
        msdf=mapping_set_df, validation_types=validation_types, fail_on_error=fail_on_error
    )

split_file(input_path, output_directory, *, method=None)

Split an SSSOM TSV by prefixes and relations.

:param input_path: The path to the input file in one of the legal formats, eg obographs, alignmentapi-xml :param output_directory: The directory to which the split file should be exported.

Source code in src/sssom/io.py
def split_file(
    input_path: str, output_directory: Union[str, Path], *, method: SplitMethod | None = None
) -> None:
    """Split an SSSOM TSV by prefixes and relations.

    :param input_path: The path to the input file in one of the legal formats, eg obographs,
        alignmentapi-xml
    :param output_directory: The directory to which the split file should be exported.
    :param method: Optional method controlling how the dataframe is split.
    """
    raise_for_bad_path(input_path)
    mapping_set_df = parse_sssom_table(input_path)
    # One output table is written per split produced from the input set.
    write_tables(split_dataframe(mapping_set_df, method=method), output_directory)

get_metadata_and_prefix_map(metadata_path=None, *, prefix_map_mode=None)

Load metadata and a prefix map in a deprecated way.

Source code in src/sssom/io.py
@deprecated(  # type: ignore[untyped-decorator]
    deprecated_in="0.4.3",
    details="This functionality for loading SSSOM metadata from a YAML file is deprecated from the "
    "public API since it has internal assumptions which are usually not valid for downstream users.",
)
def get_metadata_and_prefix_map(
    metadata_path: Union[None, str, Path] = None, *, prefix_map_mode: Optional[MergeMode] = None
) -> Tuple[Converter, MetadataType]:
    """Load metadata and a prefix map in a deprecated way.

    :param metadata_path: The path to a file containing SSSOM metadata (including prefix_map).
    :param prefix_map_mode: Defines whether the prefix map in the metadata should be extended or
        replaced with the SSSOM default prefix map.
    :returns: A pair of a converter and a metadata dictionary.
    """
    # Delegate to the private helper used by the rest of this module.
    return _get_converter_and_metadata(metadata_path=metadata_path, prefix_map_mode=prefix_map_mode)

extract_iris(input, converter)

Recursively extracts a list of IRIs from a string or file.

:param input: CURIE OR list of CURIEs OR file path containing the same. :param converter: Prefix map of mapping set (possibly) containing custom prefix:IRI combination.

:returns: A list of IRIs.

Source code in src/sssom/io.py
def extract_iris(input: RecursivePathList, converter: Converter) -> List[str]:
    """Recursively extract a list of IRIs from a string or file.

    :param input: CURIE OR list of CURIEs OR file path containing the same.
    :param converter: Prefix map of mapping set (possibly) containing custom prefix:IRI combination.

    :returns: A sorted list of unique IRIs.

    :raises TypeError: If the input is neither a string/path nor a sequence of them.
    """
    # A path to an existing file: read one entry per line and recurse on each.
    if isinstance(input, (str, Path)) and os.path.isfile(input):
        entries = Path(input).read_text().splitlines()
        return sorted(set(chain.from_iterable(extract_iris(e, converter) for e in entries)))
    # A sequence of entries: recurse on each and deduplicate across them.
    if isinstance(input, (list, tuple)):
        return sorted(set(chain.from_iterable(extract_iris(e, converter) for e in input)))
    if not isinstance(input, str):
        raise TypeError(f"Unexpected input type: {type(input)}")
    if converter.is_uri(input):
        return [converter.standardize_uri(input, strict=True)]
    if converter.is_curie(input):
        return [converter.expand(input, strict=True)]
    # Use lazy %-style arguments so the message is only built when emitted.
    logging.warning(
        "%s is neither a local file path nor a valid CURIE or URI w.r.t. the given converter. "
        "skipped from processing.",
        input,
    )
    return []

run_sql_query(query, inputs, output=None)

Run a SQL query over one or more SSSOM files.

Each of the N inputs is assigned a table name df1, df2, ..., dfN

Alternatively, the filenames can be used as table names - these are first stemmed. E.g. ~/dir/my.sssom.tsv becomes a table called 'my'

Example: sssom dosql -Q "SELECT * FROM df1 WHERE confidence>0.5 ORDER BY confidence" my.sssom.tsv

Example: sssom dosql -Q "SELECT file1.*,file2.object_id AS ext_object_id, file2.object_label AS ext_object_label FROM file1 INNER JOIN file2 WHERE file1.object_id = file2.subject_id" FROM file1.sssom.tsv file2.sssom.tsv

:param query: Query to be executed over a pandas DataFrame (msdf.df). :param inputs: Input files that form the source tables for query. :param output: Output.

:returns: Filtered MappingSetDataFrame object.

Source code in src/sssom/io.py
def run_sql_query(
    query: str, inputs: List[str], output: Optional[TextIO] = None
) -> MappingSetDataFrame:
    """Run a SQL query over one or more SSSOM files.

    Each of the N inputs is assigned a table name df1, df2, ..., dfN

    Alternatively, the filenames can be used as table names - these are first stemmed. E.g.
    ~/dir/my.sssom.tsv becomes a table called 'my'

    Example:
        sssom dosql -Q "SELECT * FROM df1 WHERE confidence>0.5 ORDER BY confidence" my.sssom.tsv

    Example:
        `sssom dosql -Q "SELECT file1.*,file2.object_id AS ext_object_id, file2.object_label AS
        ext_object_label FROM file1 INNER JOIN file2 WHERE file1.object_id = file2.subject_id" FROM
        file1.sssom.tsv file2.sssom.tsv`

    :param query: Query to be executed over a pandas DataFrame (msdf.df).
    :param inputs: Input files that form the source tables for query.
    :param output: Output.

    :returns: Filtered MappingSetDataFrame object.

    :raises ValueError: If no input files are provided.
    """
    from pansql import sqldf

    if not inputs:
        # Previously this fell through to a confusing NameError below.
        raise ValueError("At least one input file is required")
    for n, fn in enumerate(inputs, start=1):
        msdf = parse_sssom_table(fn)
        df = msdf.df
        # Register each dataframe under both a positional name (df1, df2, ...)
        # and the stemmed, lowercased filename, so queries can use either.
        globals()[f"df{n}"] = df
        tn = re.sub("[.].*", "", Path(fn).stem).lower()
        globals()[tn] = df

    new_df = sqldf(query)

    # The converter and metadata of the *last* parsed input carry over to the result.
    msdf.clean_context()
    new_msdf = MappingSetDataFrame.with_converter(
        df=new_df, converter=msdf.converter, metadata=msdf.metadata
    )
    if output is not None:
        write_table(new_msdf, output)
    return new_msdf

filter_file(input, output=None, **kwargs)

Filter a dataframe by dynamically generating queries based on user input.

e.g. sssom filter --subject_id x:% --subject_id y:% --object_id y:% --object_id z:% tests/data/basic.tsv

yields the query:

"SELECT * FROM df WHERE (subject_id LIKE 'x:%' OR subject_id LIKE 'y:%') AND (object_id LIKE 'y:%' OR object_id LIKE 'z:%') " and displays the output.

:param input: DataFrame to be queried over. :param output: Output location. :param kwargs: Filter options provided by user which generate queries (e.g.: --subject_id x:%).

:returns: Filtered MappingSetDataFrame object.

:raises ValueError: If parameter provided is invalid.

Source code in src/sssom/io.py
def filter_file(input: str, output: Optional[TextIO] = None, **kwargs: Any) -> MappingSetDataFrame:
    """Filter a dataframe by dynamically generating queries based on user input.

    e.g. sssom filter --subject_id x:% --subject_id y:% --object_id y:% --object_id z:%
    tests/data/basic.tsv

    yields the query:

    "SELECT * FROM df WHERE (subject_id LIKE 'x:%' OR subject_id LIKE 'y:%')
        AND (object_id LIKE 'y:%' OR object_id LIKE 'z:%') " and displays the output.

    :param input: DataFrame to be queried over.
    :param output: Output location.
    :param kwargs: Filter options provided by user which generate queries (e.g.: --subject_id x:%).

    :returns: Filtered MappingSetDataFrame object.

    :raises ValueError: If parameter provided is invalid.
    """
    # Keep only the filter options the user actually supplied.
    params = {k: v for k, v in kwargs.items() if v}

    # Check if all params are legit (i.e. they name columns of the input table).
    input_df: pd.DataFrame = parse_sssom_table(input).df
    if input_df.empty or len(input_df.columns) == 0:
        raise ValueError(f"{input} is either not a SSSOM TSV file or an empty one.")
    column_list = list(input_df.columns)
    invalids = [p for p in params if p not in column_list]
    if invalids:
        raise ValueError(f"The params are invalid: {invalids}")

    # Multiple values for the same column are OR-ed; distinct columns are AND-ed.
    # NOTE(review): values are interpolated directly into the SQL string with no
    # escaping, mirroring the original trusted-CLI-input assumption.
    clauses = [
        "(" + " OR ".join(f"{column} LIKE '{value}'" for value in values) + ")"
        for column, values in params.items()
    ]
    query = "SELECT * FROM df WHERE " + " AND ".join(clauses)
    return run_sql_query(query=query, inputs=[input], output=output)

annotate_file(input, output=None, replace_multivalued=False, **kwargs)

Annotate a file i.e. add custom metadata to the mapping set.

:param input: SSSOM tsv file to be queried over. :param output: Output location. :param replace_multivalued: Multivalued slots should be replaced or not, defaults to False :param kwargs: Options provided by user which are added to the metadata (e.g. --mapping_set_id http://example.org/abcd)

:returns: Annotated MappingSetDataFrame object.

Source code in src/sssom/io.py
def annotate_file(
    input: str, output: Optional[TextIO] = None, replace_multivalued: bool = False, **kwargs: Any
) -> MappingSetDataFrame:
    """Annotate a file i.e. add custom metadata to the mapping set.

    :param input: SSSOM tsv file to be queried over.
    :param output: Output location.
    :param replace_multivalued: Multivalued slots should be replaced or not, defaults to False
    :param kwargs: Options provided by user which are added to the metadata (e.g. ``--mapping_set_id
        http://example.org/abcd``)

    :returns: Annotated MappingSetDataFrame object.
    """
    # Keep only the options the user actually supplied, and confirm each names
    # a valid mapping-set slot before modifying anything.
    annotations = {key: value for key, value in kwargs.items() if value}
    are_params_slots(annotations)
    annotated_msdf = augment_metadata(parse_sssom_table(input), annotations, replace_multivalued)
    if output is not None:
        write_table(annotated_msdf, output)
    return annotated_msdf