Source code for gilda.ner

"""
Gilda implements a simple dictionary-based named entity
recognition (NER) algorithm. It can be used as follows:

>>> from gilda.ner import annotate
>>> text = "MEK phosphorylates ERK"
>>> results = annotate(text)

The results are a list of Annotation objects each of which contains:

- the `text` string matched
- the `matches`, a list of :class:`gilda.grounder.ScoredMatch` instances for
  the given text span, sorted so that the first one is the best match
- the `start` position in the text string where the entity starts
- the `end` position in the text string where the entity ends


In this example, the two concepts are grounded to FamPlex entries.

>>> results[0].text, results[0].matches[0].term.get_curie(), results[0].start, results[0].end
('MEK', 'fplx:MEK', 0, 3)
>>> results[1].text, results[1].matches[0].term.get_curie(), results[1].start, results[1].end
('ERK', 'fplx:ERK', 19, 22)

If you look directly at the first element of an annotation's ``matches``
list, you get a full description of the match itself:

>>> results[0].matches[0]
ScoredMatch(Term(mek,MEK,FPLX,MEK,MEK,curated,famplex,None,None,None),\
0.9288806431663574,Match(query=mek,ref=MEK,exact=False,space_mismatch=\
False,dash_mismatches=set(),cap_combos=[('all_lower', 'all_caps')]))

BRAT
----
Gilda implements a way to output annotation in a format appropriate for the
`BRAT Rapid Annotation Tool (BRAT) <https://brat.nlplab.org/index.html>`_.

>>> from gilda.ner import get_brat
>>> from pathlib import Path
>>> brat_string = get_brat(results)
>>> Path("results.ann").write_text(brat_string)
>>> Path("results.txt").write_text(text)

For brat to work, you need to store the text in a file with
the extension ``.txt`` and the annotations in a file with the
same name but extension ``.ann``.
"""

from typing import List, Set
import os

from nltk.tokenize import PunktSentenceTokenizer, TreebankWordTokenizer

from gilda import get_grounder
from gilda.grounder import Annotation
from gilda.process import normalize

# Names exported by ``from gilda.ner import *``.
__all__ = [
    "annotate",
    "get_brat",
    "stop_words"
]

# Directory named 'resources' located next to this module file.
RESOURCES_DIR = os.path.join(os.path.split(__file__)[0], 'resources')
# Paths of the two word-list files inside the resources directory.
STOPLIST_PATH, CORE_STOPWORDS_PATH = (
    os.path.join(RESOURCES_DIR, file_name)
    for file_name in ('ner_stoplist.txt', 'core_stopwords.txt')
)


def _load_words(path: str) -> Set[str]:
    """Load a set of words from a file with one word per line.

    Parameters
    ----------
    path : str
        Path to a text file containing one word per line.

    Returns
    -------
    Set[str]
        The distinct non-empty lines of the file with surrounding
        whitespace removed.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(path, encoding="utf-8") as file:
        stripped_lines = (line.strip() for line in file)
        # Blank (or whitespace-only) lines are dropped
        return {word for word in stripped_lines if word}


# Words loaded from the core stopwords file; `annotate` never starts a
# match on one of these.
core_stop_words = _load_words(CORE_STOPWORDS_PATH)
# The core stopwords combined with the words from the NER stoplist file.
stop_words = core_stop_words.union(_load_words(STOPLIST_PATH))


def annotate(
    text,
    *,
    grounder=None,
    sent_split_fun=None,
    organisms=None,
    namespaces=None,
    context_text: str = None,
) -> List[Annotation]:
    """Annotate a given text with Gilda.

    Parameters
    ----------
    text : str
        The text to be annotated.
    grounder : gilda.grounder.Grounder, optional
        The Gilda grounder to use for grounding. If not provided, the
        grounder returned by :func:`gilda.get_grounder` is used.
    sent_split_fun : Callable[str, Iterable[Tuple[int, int]]], optional
        A function that splits the text into sentences. The default is
        :func:`nltk.tokenize.PunktSentenceTokenizer.span_tokenize`. The
        function should take a string as input and return an iterable of
        coordinate pairs corresponding to the start and end coordinates for
        each sentence in the input text.
    organisms : list[str], optional
        A list of organism names to pass to the grounder. If not provided,
        human is used.
    namespaces : list[str], optional
        A list of namespaces to pass to the grounder to restrict the matches
        to. By default, no restriction is applied.
    context_text : str, optional
        A longer span of text that serves as additional context for the text
        being annotated for disambiguation purposes. If not provided, `text`
        itself is passed to the grounder as context.

    Returns
    -------
    List[Annotation]
        A list of Annotations where each contains as attributes
        the text span that was matched, the list of ScoredMatches, and the
        start and end character offsets of the text span.
    """
    if grounder is None:
        grounder = get_grounder()
    if sent_split_fun is None:
        sent_tokenizer = PunktSentenceTokenizer()
        sent_split_fun = sent_tokenizer.span_tokenize
    # The disambiguation context is the same for every candidate span,
    # so it is chosen once up front
    context = text if context_text is None else context_text
    # Get sentences
    sentence_coords = sent_split_fun(text)
    annotations = []
    word_tokenizer = TreebankWordTokenizer()
    # FIXME: a custom sentence split function can be inconsistent
    # with the coordinates being used here which come from NLTK
    for sent_start, sent_end in sentence_coords:
        sentence = text[sent_start:sent_end]
        # FIXME: one rare corner case is named entities with single quotes
        # in them which get tokenized in a weird way
        raw_word_coords = \
            list(word_tokenizer.span_tokenize(sentence.rstrip('.')))
        raw_words = [sentence[start:end] for start, end in raw_word_coords]
        words = [normalize(w) for w in raw_words]
        # Index of the first word not yet covered by an accepted annotation
        skip_until = 0
        for idx, word in enumerate(words):
            if idx < skip_until:
                continue
            if word in core_stop_words:
                continue
            spans = grounder.prefix_index.get(word, set())
            if not spans:
                continue

            # Only consider spans that are within the sentence
            applicable_spans = {span for span in spans
                                if idx + span <= len(words)}

            # Offset within the sentence at which the candidate span starts
            span_offset = raw_word_coords[idx][0]

            # Find the largest matching span
            for span in sorted(applicable_spans, reverse=True):
                # If the span we are looking at is a single word
                # and it is a stopword then we skip this. This ensures
                # that we don't skip longer spans of which the stopword
                # is only a part.
                if span == 1 and word in stop_words:
                    continue

                # We have to reconstruct a text span while adding spaces
                # where needed
                raw_span = ''
                for rw, c in zip(raw_words[idx:idx+span],
                                 raw_word_coords[idx:idx+span]):
                    # Figure out if we need a space before this word, then
                    # append the word.
                    spaces = ' ' * (c[0] - len(raw_span) - span_offset)
                    raw_span += spaces + rw

                # If span is a single character, we don't want to consider it
                if len(raw_span) <= 1:
                    continue

                matches = grounder.ground(raw_span,
                                          context=context,
                                          organisms=organisms,
                                          namespaces=namespaces)
                if matches:
                    # Convert sentence-relative token offsets into offsets
                    # relative to the full input text
                    start_coord = sent_start + span_offset
                    end_coord = sent_start + raw_word_coords[idx+span-1][1]
                    annotations.append(Annotation(
                        raw_span, matches, start_coord, end_coord
                    ))

                    # Words covered by this annotation are not revisited
                    skip_until = idx + span
                    break
    return annotations
def get_brat(annotations, entity_type="Entity", ix_offset=1, include_text=True):
    """Build the brat standoff annotation string for a list of annotations.

    Each annotation produces two lines: a text-bound ``T`` row with the
    entity type and character offsets, and a ``#`` AnnotatorNotes row
    carrying the CURIE of the annotation's first match.

    Parameters
    ----------
    annotations : list[Annotation]
        A list of named entity annotations in the text.
    entity_type : str, optional
        The brat entity type to use for the annotations. The default is
        'Entity'. Any other value is also appended to the annotator note
        as the reading system, which helps differentiate annotations of
        the same text extracted by different reading systems.
    ix_offset : int, optional
        The index used for the first brat annotation. The default is 1,
        and values below 1 are raised to 1.
    include_text : bool, optional
        Whether to append the matched entity text to each ``T`` row.
        The default is True.

    Returns
    -------
    str
        A string containing the brat-formatted annotations, terminated
        by a newline.
    """
    first_index = max(1, ix_offset)
    # Non-default entity types are recorded in the note as the reading system
    if entity_type == "Entity":
        note_suffix = ""
    else:
        note_suffix = f"; Reading system: {entity_type}"

    lines = []
    for number, entry in enumerate(annotations, first_index):
        # The first match is the one reported in the annotator note
        note = entry.matches[0].term.get_curie() + note_suffix
        entity_row = f'T{number}\t{entity_type} {entry.start} {entry.end}'
        if include_text:
            entity_row += f'\t{entry.text}'
        lines.append(entity_row)
        lines.append(f'#{number}\tAnnotatorNotes T{number}\t{note}')
    return '\n'.join(lines) + '\n'