# Source code for gilda.term

import csv
import gzip
import itertools
import logging
from typing import Iterable, Optional, Set, Tuple

__all__ = [
    "Term",
    "get_identifiers_curie",
    "get_identifiers_url",
    "filter_out_duplicates",
    "dump_terms",
]

logger = logging.getLogger(__name__)


class Term(object):
    """Represents a text entry corresponding to a grounded term.

    Attributes
    ----------
    norm_text : str
        The normalized text corresponding to the text entry, used for
        lookups.
    text : str
        The text entry itself.
    db : str
        The database / name space corresponding to the grounded term.
    id : str
        The identifier of the grounded term within the database / name
        space.
    entry_name : str
        The standardized name corresponding to the grounded term.
    status : str
        The relationship of the text entry to the grounded term, e.g.,
        synonym.
    source : str
        The source from which the term was obtained.
    organism : Optional[str]
        When the term represents a protein, this attribute provides the
        taxonomy code of the species for the protein. For non-proteins,
        not provided. Default: None
    source_db : Optional[str]
        If the term's db/id was mapped from a different, original db/id
        from a given source, this attribute provides the original db value
        before mapping.
    source_id : Optional[str]
        If the term's db/id was mapped from a different, original db/id
        from a given source, this attribute provides the original ID value
        before mapping.
    """
    def __init__(self, norm_text, text, db, id, entry_name, status,
                 source, organism=None, source_db=None, source_id=None):
        if not text:
            raise ValueError('Text for Term cannot be empty')
        if not norm_text.strip():
            raise ValueError('Normalized text for Term cannot be empty')
        self.norm_text = norm_text
        self.text = text
        self.db = db
        self.id = str(id)
        self.entry_name = entry_name
        self.status = status
        self.source = source
        self.organism = organism
        self.source_db = source_db
        self.source_id = source_id

    def __str__(self):
        return 'Term(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)' % (
            self.norm_text, self.text, self.db, self.id, self.entry_name,
            self.status, self.source, self.organism, self.source_db,
            self.source_id)

    def __repr__(self):
        return str(self)

    def to_json(self):
        """Return the term serialized into a JSON dict."""
        js = {
            'norm_text': self.norm_text,
            'text': self.text,
            'db': self.db,
            'id': self.id,
            'entry_name': self.entry_name,
            'status': self.status,
            'source': self.source,
        }
        if self.organism:
            js['organism'] = self.organism
        if self.source_db:
            js['source_db'] = self.source_db
        if self.source_id:
            js['source_id'] = self.source_id
        return js

    def to_list(self):
        """Return the term serialized into a list of strings."""
        return [self.norm_text, self.text, self.db, self.id,
                self.entry_name, self.status, self.source, self.organism,
                self.source_db, self.source_id]

    def get_curie(self) -> str:
        """Get the compact URI for this term."""
        return get_identifiers_curie(self.db, self.id)

    def get_identifiers_url(self):
        """Get the identifiers.org URL for this term."""
        return get_identifiers_url(self.db, self.id)

    def get_groundings(self) -> Set[Tuple[str, str]]:
        """Return all groundings for this term, including from a mapped
        source.

        Returns
        -------
        :
            A set of tuples representing the main grounding for this term,
            as well as any source grounding from which the main grounding
            was mapped.
        """
        groundings = {(self.db, self.id)}
        if self.source_db:
            groundings.add((self.source_db, self.source_id))
        return groundings

    def get_namespaces(self) -> Set[str]:
        """Return all namespaces for this term, including from a mapped
        source.

        Returns
        -------
        :
            A set of strings including the main namespace for this term,
            as well as any source namespace from which the main grounding
            was mapped.
        """
        namespaces = {self.db}
        if self.source_db:
            namespaces.add(self.source_db)
        return namespaces
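
# --- Usage sketch (illustrative; not part of the module) ---
# A minimal example of constructing and serializing a Term. The field
# values below are hypothetical and chosen only to mirror the attribute
# documentation above, not any real lexical entry.
#
#     term = Term(norm_text='braf', text='BRAF', db='HGNC', id='1097',
#                 entry_name='BRAF', status='name', source='hgnc',
#                 organism='9606')
#     term.to_json()    # -> {'norm_text': 'braf', 'text': 'BRAF', ...}
#     term.get_curie()  # -> 'hgnc:1097'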

def get_identifiers_curie(db, id) -> Optional[str]:
    """Return the CURIE for a given database name and identifier."""
    curie_pattern = '{db}:{id}'
    if db == 'UP':
        db = 'uniprot'
    id_parts = id.split(':')
    if len(id_parts) == 1:
        return curie_pattern.format(db=db.lower(), id=id)
    elif len(id_parts) == 2:
        return curie_pattern.format(db=id_parts[0].upper(), id=id_parts[-1])
    return None


def get_identifiers_url(db, id):
    """Return the identifiers.org URL for a given database name and ID."""
    curie = get_identifiers_curie(db, id)
    if curie is not None:
        return f'https://identifiers.org/{curie}'


def _term_key(term: Term) -> Tuple[str, str, str]:
    """Group terms by their (db, id, text) triple."""
    return term.db, term.id, term.text


# Lower values represent higher priority when deduplicating terms
statuses = {'curated': 1, 'name': 2, 'synonym': 3, 'former_name': 4}


def _priority_key(term: Term) -> Tuple[int, int]:
    """Prioritize terms (pre-grouped by db/id/text) first based on status,
    and, if the status is the same, give priority to the ones that are
    from primary resources.
    """
    return (
        statuses[term.status],
        0 if term.db.casefold() == term.source.casefold() else 1,
    )


def filter_out_duplicates(terms):
    """Return a list of unique terms, keeping the highest-priority entry
    for each (db, id, text) triple."""
    logger.info('Filtering %d terms for uniqueness...', len(terms))
    new_terms = []
    for _, group in itertools.groupby(sorted(terms, key=_term_key),
                                      key=_term_key):
        # Keep the highest-priority term within each group
        new_terms.append(min(group, key=_priority_key))
    # Re-sort the terms
    new_terms = sorted(new_terms, key=lambda x: (x.text, x.db, x.id))
    logger.info('Got %d unique terms...', len(new_terms))
    return new_terms


TERMS_HEADER = ['norm_text', 'text', 'db', 'id', 'entry_name', 'status',
                'source', 'organism', 'source_db', 'source_id']
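
# --- Usage sketch (illustrative; not part of the module) ---
# filter_out_duplicates keeps one Term per (db, id, text) triple, preferring
# higher-priority statuses (see the statuses dict above) and, on ties, terms
# whose source matches their namespace. The example terms are hypothetical.
#
#     name = Term('braf', 'BRAF', 'HGNC', '1097', 'BRAF', 'name', 'hgnc')
#     synonym = Term('braf', 'BRAF', 'HGNC', '1097', 'BRAF', 'synonym',
#                    'mesh')
#     filter_out_duplicates([name, synonym])  # -> [name]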

def dump_terms(terms: Iterable[Term], fname) -> None:
    """Dump a list of terms to a tsv.gz file."""
    logger.info('Dumping into %s', fname)
    with gzip.open(fname, 'wt', encoding='utf-8') as fh:
        writer = csv.writer(fh, delimiter='\t')
        writer.writerow(TERMS_HEADER)
        writer.writerows(t.to_list() for t in terms)
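
# --- Usage sketch (illustrative; not part of the module) ---
# Round-tripping a dumped file with the standard library; the file name is
# hypothetical.
#
#     dump_terms([term], 'terms.tsv.gz')
#     with gzip.open('terms.tsv.gz', 'rt', encoding='utf-8') as fh:
#         rows = list(csv.reader(fh, delimiter='\t'))
#     rows[0] == TERMS_HEADER  # header row, then one row per term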