import csv
import gzip
import itertools
import logging
from typing import Iterable, Optional, Set, Tuple
__all__ = [
"Term",
"get_identifiers_curie",
"get_identifiers_url",
"filter_out_duplicates",
"dump_terms",
]
logger = logging.getLogger(__name__)
class Term(object):
"""Represents a text entry corresponding to a grounded term.
Attributes
----------
norm_text : str
The normalized text corresponding to the text entry, used for lookups.
text : str
The text entry itself.
    db : str
        The database / namespace corresponding to the grounded term.
    id : str
        The identifier of the grounded term within the database / namespace.
entry_name : str
The standardized name corresponding to the grounded term.
status : str
The relationship of the text entry to the grounded term, e.g., synonym.
source : str
The source from which the term was obtained.
organism : Optional[str]
When the term represents a protein, this attribute provides the
taxonomy code of the species for the protein.
        For non-proteins, it is not provided. Default: None
source_db : Optional[str]
If the term's db/id was mapped from a different, original db/id
from a given source, this attribute provides the original db value
before mapping.
source_id : Optional[str]
If the term's db/id was mapped from a different, original db/id
from a given source, this attribute provides the original ID value
before mapping.
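
    Examples
    --------
    A minimal, illustrative construction (the specific names and identifiers
    below are example values, not asserted database content):

    >>> term = Term('mek1', 'MEK1', 'HGNC', '6840', 'MAP2K1',
    ...             'synonym', 'famplex')
    >>> term.get_curie()
    'hgnc:6840'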
"""
def __init__(self, norm_text, text, db, id, entry_name, status, source,
organism=None, source_db=None, source_id=None):
if not text:
raise ValueError('Text for Term cannot be empty')
if not norm_text.strip():
raise ValueError('Normalized text for Term cannot be empty')
self.norm_text = norm_text
self.text = text
self.db = db
self.id = str(id)
self.entry_name = entry_name
self.status = status
self.source = source
self.organism = organism
self.source_db = source_db
self.source_id = source_id
def __str__(self):
return 'Term(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)' % (
self.norm_text, self.text, self.db, self.id, self.entry_name,
self.status, self.source, self.organism, self.source_db,
self.source_id)
def __repr__(self):
return str(self)
    def to_json(self):
"""Return the term serialized into a JSON dict."""
js = {
'norm_text': self.norm_text,
'text': self.text,
'db': self.db,
'id': self.id,
'entry_name': self.entry_name,
'status': self.status,
'source': self.source,
}
if self.organism:
js['organism'] = self.organism
if self.source_db:
js['source_db'] = self.source_db
if self.source_id:
js['source_id'] = self.source_id
return js
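    # Sketch of the to_json output for a hypothetical term; optional keys
    # such as 'organism', 'source_db', and 'source_id' only appear when the
    # corresponding attribute is set:
    #   {'norm_text': 'mek1', 'text': 'MEK1', 'db': 'HGNC', 'id': '6840',
    #    'entry_name': 'MAP2K1', 'status': 'synonym', 'source': 'famplex'}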
    def to_list(self):
"""Return the term serialized into a list of strings."""
return [self.norm_text, self.text, self.db, self.id,
self.entry_name, self.status, self.source,
self.organism, self.source_db, self.source_id]
    def get_curie(self) -> str:
"""Get the compact URI for this term."""
return get_identifiers_curie(self.db, self.id)
    def get_identifiers_url(self):
        """Get the identifiers.org URL for this term."""
        return get_identifiers_url(self.db, self.id)
    def get_groundings(self) -> Set[Tuple[str, str]]:
"""Return all groundings for this term, including from a mapped source.
Returns
-------
:
A set of tuples representing the main grounding for this term,
as well as any source grounding from which the main grounding
was mapped.
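
        Examples
        --------
        An illustrative term whose grounding was mapped from a hypothetical
        source namespace (the values are placeholders):

        >>> term = Term('erk', 'ERK', 'FPLX', 'ERK', 'ERK', 'synonym',
        ...             'famplex', source_db='BE', source_id='ERK')
        >>> sorted(term.get_groundings())
        [('BE', 'ERK'), ('FPLX', 'ERK')]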
"""
groundings = {(self.db, self.id)}
if self.source_db:
groundings.add((self.source_db, self.source_id))
return groundings
    def get_namespaces(self) -> Set[str]:
"""Return all namespaces for this term, including from a mapped source.
Returns
-------
:
A set of strings including the main namespace for this term,
as well as any source namespace from which the main grounding
was mapped.
"""
namespaces = {self.db}
if self.source_db:
namespaces.add(self.source_db)
return namespaces
def get_identifiers_curie(db, id) -> Optional[str]:
    """Return the identifiers.org CURIE for a given database name and ID.

    If the ID itself contains a namespace prefix (i.e., it is of the form
    'PREFIX:12345'), that prefix is used in the CURIE; otherwise the db
    value is used as the prefix. Returns None if no CURIE can be
    constructed.
    """
curie_pattern = '{db}:{id}'
if db == 'UP':
db = 'uniprot'
id_parts = id.split(':')
if len(id_parts) == 1:
return curie_pattern.format(db=db.lower(), id=id)
elif len(id_parts) == 2:
return curie_pattern.format(db=id_parts[0].upper(), id=id_parts[-1])
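# Illustrative input/output pairs for get_identifiers_curie (the values are
# examples of the string transformation only):
#   get_identifiers_curie('HGNC', '6840')        -> 'hgnc:6840'
#   get_identifiers_curie('UP', 'P28482')        -> 'uniprot:P28482'
#   get_identifiers_curie('CHEBI', 'CHEBI:1234') -> 'CHEBI:1234'
#   get_identifiers_curie('XDB', 'a:b:c')        -> None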
def get_identifiers_url(db, id):
    """Return the identifiers.org URL for a given database name and ID."""
curie = get_identifiers_curie(db, id)
if curie is not None:
return f'https://identifiers.org/{curie}'
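# For example, get_identifiers_url('HGNC', '6840') would return
# 'https://identifiers.org/hgnc:6840' (example values only).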
def _term_key(term: Term) -> Tuple[str, str, str]:
    """Return the key used to group duplicate terms: (db, id, text)."""
    return term.db, term.id, term.text
# Priority ranking of term statuses; lower values take precedence
statuses = {'curated': 1, 'name': 2, 'synonym': 3, 'former_name': 4}
def _priority_key(term: Term) -> Tuple[int, int]:
    """Return a sort key for terms that are pre-grouped by db/id/text.

    Terms are prioritized first by status and, if the status is the same,
    priority is given to terms that come from a primary resource (i.e.,
    ones whose source matches their db).
    """
    return (
        statuses[term.status],
        0 if term.db.casefold() == term.source.casefold() else 1
    )
def filter_out_duplicates(terms):
    """Filter out duplicate terms, keeping only the highest-priority entry
    among terms sharing the same db, id, and text."""
    logger.info('Filtering %d terms for uniqueness...', len(terms))
    new_terms = []
    # Group terms that share the same db/id/text and keep only the
    # highest-priority entry from each group
    for _, duplicates in itertools.groupby(sorted(terms, key=_term_key),
                                           key=_term_key):
        new_terms.append(min(duplicates, key=_priority_key))
    # Re-sort the remaining terms
    new_terms = sorted(new_terms, key=lambda x: (x.text, x.db, x.id))
    logger.info('Got %d unique terms...', len(new_terms))
    return new_terms
TERMS_HEADER = ['norm_text', 'text', 'db', 'id', 'entry_name', 'status',
'source', 'organism', 'source_db', 'source_id']
def dump_terms(terms: Iterable[Term], fname) -> None:
"""Dump a list of terms to a tsv.gz file."""
logger.info('Dumping into %s', fname)
with gzip.open(fname, 'wt', encoding='utf-8') as fh:
writer = csv.writer(fh, delimiter='\t')
writer.writerow(TERMS_HEADER)
writer.writerows(t.to_list() for t in terms)
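# A minimal end-to-end usage sketch; the term values and output file name
# below are hypothetical and only run when the module is executed directly.
if __name__ == '__main__':
    example_terms = [
        Term('mek1', 'MEK1', 'HGNC', '6840', 'MAP2K1', 'synonym', 'famplex'),
        Term('mek1', 'MEK1', 'HGNC', '6840', 'MAP2K1', 'synonym', 'hgnc'),
    ]
    # Duplicates (same db/id/text) are resolved in favor of the primary
    # resource, then the remaining terms are written to a tsv.gz file
    unique_terms = filter_out_duplicates(example_terms)
    dump_terms(unique_terms, 'example_terms.tsv.gz')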