Source code for gilda.api

# Public names of this module, i.e. what ``from gilda.api import *`` exports.
__all__ = [
    'ground',
    'get_models',
    'get_names',
    'get_grounder',
    'make_grounder',
    'annotate',
]

from typing import List, Mapping, Union, Optional

from gilda.grounder import Grounder
from gilda.term import Term


class GrounderInstance(object):
    """Lazy holder around a single :class:`Grounder`.

    No grounder is built when this object is created. The first call that
    needs one constructs it with ``Grounder()`` and every later call reuses
    that same object.
    """

    def __init__(self):
        # Filled in on demand by get_grounder().
        self.grounder = None

    def get_grounder(self):
        """Return the wrapped grounder, constructing it on first use."""
        if self.grounder is not None:
            return self.grounder
        self.grounder = Grounder()
        return self.grounder

    def ground(self, text, context=None, organisms=None,
               namespaces=None):
        """Forward the call to the wrapped grounder's ``ground`` method."""
        delegate = self.get_grounder()
        return delegate.ground(
            text,
            context=context,
            organisms=organisms,
            namespaces=namespaces,
        )

    def get_models(self):
        """Forward the call to the wrapped grounder's ``get_models`` method."""
        delegate = self.get_grounder()
        return delegate.get_models()

    def get_names(self, db, id, status=None, source=None):
        """Forward the call to the wrapped grounder's ``get_names`` method."""
        delegate = self.get_grounder()
        return delegate.get_names(db, id, status=status, source=source)

    @property
    def prefix_index(self):
        """The ``prefix_index`` attribute of the wrapped grounder."""
        delegate = self.get_grounder()
        return delegate.prefix_index


# Shared module-level instance used by the module-level API functions in this
# file; the wrapped Grounder is only constructed on first use.
grounder = GrounderInstance()


def ground(text, context=None, organisms=None, namespaces=None):
    """Return a list of scored matches for a text to ground.

    Parameters
    ----------
    text : str
        The entity text to be grounded.
    context : Optional[str]
        Any additional text that serves as context for disambiguating the
        given entity text, used if a model exists for disambiguating the
        given text.
    organisms : Optional[List[str]]
        A list of taxonomy identifiers to use as a priority list
        when surfacing matches for proteins/genes from multiple organisms.
    namespaces : Optional[List[str]]
        A list of namespaces to restrict the matches to. By default, no
        restriction is applied.

    Returns
    -------
    list[gilda.grounder.ScoredMatch]
        A list of ScoredMatch objects representing the groundings.

    Examples
    --------
    Ground a string corresponding to an entity name, label, or synonym

    >>> import gilda
    >>> scored_matches = gilda.ground('mapt')

    The matches are sorted in descending order by score, and in the event of
    a tie, by the namespace of the primary grounding. Each scored match has a
    :class:`gilda.term.Term` object that contains information about the
    primary grounding.

    >>> scored_matches[0].term.db
    'hgnc'
    >>> scored_matches[0].term.id
    '6893'
    >>> scored_matches[0].term.get_curie()
    'hgnc:6893'

    The score for each match can be accessed directly:

    >>> scored_matches[0].score
    0.7623

    The rationale for each match is contained in the ``match`` attribute
    whose fields are described in :class:`gilda.scorer.Match`:

    >>> match_object = scored_matches[0].match

    Give optional context to be used by Gilda's disambiguation models, if
    available

    >>> scored_matches = gilda.ground('ER', context='Calcium is released from the ER.')

    Only return results from a certain namespace, such as when a family and
    gene have the same name

    >>> scored_matches = gilda.ground('ESR', namespaces=["hgnc"])
    """
    # Delegate to the shared module-level instance.
    return grounder.ground(text=text, context=context, organisms=organisms,
                           namespaces=namespaces)
def annotate(
    text: str,
    sent_split_fun=None,
    organisms=None,
    namespaces=None,
    return_first: bool = True,
):
    """Annotate a given text with Gilda (i.e., do named entity recognition).

    Parameters
    ----------
    text : str
        The text to be annotated.
    sent_split_fun : Callable, optional
        A function that splits the text into sentences. The default is
        :func:`nltk.tokenize.sent_tokenize`. The function should take a string
        as input and return an iterable of strings corresponding to the
        sentences in the input text.
    organisms : list[str], optional
        A list of organism names to pass to the grounder. If not provided,
        human is used.
    namespaces : list[str], optional
        A list of namespaces to pass to the grounder to restrict the matches
        to. By default, no restriction is applied.
    return_first : bool
        If true, only returns the first result. Otherwise, returns all
        results.

    Returns
    -------
    list[tuple[str, ScoredMatch, int, int]]
        A list of tuples, each containing the entity text, the ScoredMatch
        object corresponding to the entity, and the start and end character
        offsets of the text corresponding to the entity.
    """
    # Imported at call time rather than at module level.
    # NOTE(review): presumably so the NER dependencies are only loaded when
    # annotation is actually used -- confirm against gilda.ner.
    from .ner import annotate as _annotate

    # Pass the shared module-level instance so annotation uses the same
    # grounder as the other functions of this module.
    return _annotate(
        text,
        grounder=grounder,
        sent_split_fun=sent_split_fun,
        organisms=organisms,
        namespaces=namespaces,
        return_first=return_first,
    )
def get_models():
    """Return a list of entity texts for which disambiguation models exist.

    Returns
    -------
    list[str]
        The list of entity texts for which a disambiguation model is
        available.
    """
    # Delegate to the shared module-level instance.
    return grounder.get_models()
def get_names(db, id, status=None, source=None):
    """Return a list of entity texts corresponding to a given database ID.

    Parameters
    ----------
    db : str
        The database in which the ID is an entry, e.g., HGNC.
    id : str
        The ID of an entry in the database.
    status : Optional[str]
        If given, only entity texts with the given status e.g., "synonym"
        are returned.
    source : Optional[str]
        If given, only entity texts from the given source e.g., "uniprot"
        are returned.

    Returns
    -------
    :
        The entity texts corresponding to the given database ID, as returned
        by the default grounder.
    """
    # Delegate to the shared module-level instance.
    return grounder.get_names(db, id, status=status, source=source)
def get_grounder() -> Grounder:
    """Initialize and return the default Grounder instance.

    Returns
    -------
    :
        A Grounder instance whose attributes and methods can be used
        directly.
    """
    # The shared module-level instance constructs its Grounder on first use
    # and returns the same object on later calls.
    return grounder.get_grounder()
def make_grounder(
    terms: Union[str, List[Term], Mapping[str, List[Term]]],
) -> Grounder:
    """Create a custom grounder from a list of Terms.

    Parameters
    ----------
    terms :
        Specifies the grounding terms that should be loaded in the Grounder.
        If str, it is interpreted as a path to a grounding
        terms gzipped TSV file which is then loaded. If list, it is assumed to
        be a flat list of Terms. If dict, it is assumed to be a grounding terms
        dict with normalized entity strings as keys and lists of Term objects
        as values.

    Returns
    -------
    :
        A Grounder instance, initialized with the given custom set of terms.

    Examples
    --------
    The following example shows how to get an ontology with :mod:`obonet` and
    load custom terms:

    .. code-block:: python

        import obonet
        from tqdm import tqdm

        import gilda
        from gilda.process import normalize

        prefix = "UBERON"
        url = "http://purl.obolibrary.org/obo/uberon/basic.obo"
        g = obonet.read_obo(url)
        custom_obo_terms = []
        it = tqdm(g.nodes(data=True), unit_scale=True, unit="node")
        for node, data in it:
            # Skip entries imported from other ontologies
            if not node.startswith(f"{prefix}:"):
                continue

            identifier = node.removeprefix(f"{prefix}:")

            name = data["name"]
            custom_obo_terms.append(gilda.Term(
                norm_text=normalize(name),
                text=name,
                db=prefix,
                id=identifier,
                entry_name=name,
                status="name",
                source=prefix,
            ))

            # Add terms for all synonyms
            for synonym_raw in data.get("synonym", []):
                try:
                    # Try to parse out of the quoted OBO Field
                    synonym = synonym_raw.split('"')[1].strip()
                except IndexError:
                    continue  # the synonym was malformed

                custom_obo_terms.append(gilda.Term(
                    norm_text=normalize(synonym),
                    text=synonym,
                    db=prefix,
                    id=identifier,
                    entry_name=name,
                    status="synonym",
                    source=prefix,
                ))

        custom_grounder = gilda.make_grounder(custom_obo_terms)
        scored_matches = custom_grounder.ground("head")

    Additional examples for loading custom content from OBO Graph JSON,
    :mod:`pyobo`, and more can be found in the `Jupyter notebooks
    <https://github.com/indralab/gilda/tree/master/notebooks>`_
    in the Gilda repository on GitHub.
    """
    # A fresh Grounder is built on every call; the shared module-level
    # instance is not involved.
    return Grounder(terms=terms)