"""
Gilda implements a simple dictionary-based named entity
recognition (NER) algorithm. It can be used as follows:
>>> from gilda.ner import annotate
>>> text = "MEK phosphorylates ERK"
>>> results = annotate(text)
The results are a list of Annotation objects each of which contains:
- the `text` string matched
- the `matches` list of :class:`gilda.grounder.ScoredMatch` instances for the given
text span, sorted so that the first one is the best match
- the `start` position in the text string where the entity starts
- the `end` position in the text string where the entity ends
In this example, the two concepts are grounded to FamPlex entries.
>>> results[0].text, results[0].matches[0].term.get_curie(), results[0].start, results[0].end
('MEK', 'fplx:MEK', 0, 3)
>>> results[1].text, results[1].matches[0].term.get_curie(), results[1].start, results[1].end
('ERK', 'fplx:ERK', 19, 22)
If you look directly at an entry of an annotation's `matches` list, you get a full
description of the match itself:
>>> results[0].matches[0]
ScoredMatch(Term(mek,MEK,FPLX,MEK,MEK,curated,famplex,None,None,None),\
0.9288806431663574,Match(query=mek,ref=MEK,exact=False,space_mismatch=\
False,dash_mismatches=set(),cap_combos=[('all_lower', 'all_caps')]))
BRAT
----
Gilda implements a way to output annotation in a format appropriate for the
`BRAT Rapid Annotation Tool (BRAT) <https://brat.nlplab.org/index.html>`_.
>>> from gilda.ner import get_brat
>>> from pathlib import Path
>>> brat_string = get_brat(results)
>>> Path("results.ann").write_text(brat_string)
>>> Path("results.txt").write_text(text)
For brat to work, you need to store the text in a file with
the extension ``.txt`` and the annotations in a file with the
same name but extension ``.ann``.
"""
from typing import List, Set
import os
from nltk.tokenize import PunktSentenceTokenizer, TreebankWordTokenizer
from gilda import get_grounder
from gilda.grounder import Annotation
from gilda.process import normalize
# Names exported by ``from gilda.ner import *``.
__all__ = ["annotate", "get_brat", "stop_words"]

# The word lists are looked up in a ``resources`` directory located next to
# this module.
_MODULE_DIR = os.path.dirname(__file__)
RESOURCES_DIR = os.path.join(_MODULE_DIR, 'resources')
# Both files are expected to hold one word per line.
STOPLIST_PATH = os.path.join(RESOURCES_DIR, 'ner_stoplist.txt')
CORE_STOPWORDS_PATH = os.path.join(RESOURCES_DIR, 'core_stopwords.txt')
def _load_words(path: str) -> Set[str]:
    """Load a set of words from a file with one word per line.

    Parameters
    ----------
    path : str
        Path to a text file containing one word per line.

    Returns
    -------
    Set[str]
        The distinct words found in the file, with surrounding whitespace
        removed. Lines that are empty or contain only whitespace are ignored.
    """
    # Decode explicitly as UTF-8 so that the loaded words do not depend on
    # the platform's default locale encoding.
    with open(path, encoding="utf-8") as file:
        stripped_lines = (line.strip() for line in file)
        return {word for word in stripped_lines if word}
# Words listed in the core stop word file.
core_stop_words = _load_words(CORE_STOPWORDS_PATH)
# The full stop word set: the core stop words plus the words listed in the
# NER stoplist file.
stop_words = core_stop_words.union(_load_words(STOPLIST_PATH))
def annotate(
    text, *,
    grounder=None,
    sent_split_fun=None,
    organisms=None,
    namespaces=None,
    context_text: str = None,
) -> List[Annotation]:
    """Annotate a given text with Gilda.

    Parameters
    ----------
    text : str
        The text to be annotated.
    grounder : gilda.grounder.Grounder, optional
        The Gilda grounder to use for grounding.
    sent_split_fun : Callable[str, Iterable[Tuple[int, int]]], optional
        A function that splits the text into sentences. The default is
        :func:`nltk.tokenize.PunktSentenceTokenizer.span_tokenize`. The
        function should take a string as input and return an iterable of
        coordinate pairs corresponding to the start and end coordinates for
        each sentence in the input text.
    organisms : list[str], optional
        A list of organism names to pass to the grounder. If not provided,
        human is used.
    namespaces : List[str], optional
        A list of namespaces to pass to the grounder to restrict the matches
        to. By default, no restriction is applied.
    context_text : str, optional
        A longer span of text that serves as additional context for the text
        being annotated for disambiguation purposes. If not provided, the
        text being annotated is itself used as context.

    Returns
    -------
    List[Annotation]
        A list of Annotations where each contains as attributes
        the text span that was matched, the list of ScoredMatches, and the
        start and end character offsets of the text span.
    """
    if grounder is None:
        grounder = get_grounder()
    if sent_split_fun is None:
        sent_tokenizer = PunktSentenceTokenizer()
        sent_split_fun = sent_tokenizer.span_tokenize
    # The disambiguation context is the same for every candidate span, so
    # it is resolved once here rather than inside the innermost loop.
    context = text if context_text is None else context_text
    # Get sentences
    sentence_coords = sent_split_fun(text)
    annotations = []
    word_tokenizer = TreebankWordTokenizer()
    # FIXME: a custom sentence split function can be inconsistent
    # with the coordinates being used here which come from NLTK
    for sent_start, sent_end in sentence_coords:
        sentence = text[sent_start:sent_end]
        # FIXME: one rare corner case is named entities with single quotes
        # in them which get tokenized in a weird way
        # Word coordinates are relative to the start of the sentence;
        # trailing periods are removed before tokenizing.
        raw_word_coords = \
            list(word_tokenizer.span_tokenize(sentence.rstrip('.')))
        raw_words = [sentence[start:end] for start, end in raw_word_coords]
        words = [normalize(w) for w in raw_words]
        # Index of the first word not yet covered by an accepted annotation
        skip_until = 0
        for idx, word in enumerate(words):
            if idx < skip_until:
                continue
            # Core stop words never start a candidate span
            if word in core_stop_words:
                continue
            # The prefix index maps a first word to the span lengths
            # (in words) to try starting at that word
            spans = grounder.prefix_index.get(word, set())
            if not spans:
                continue
            # Only consider spans that are within the sentence
            applicable_spans = {span for span in spans
                                if idx + span <= len(words)}
            # Find the largest matching span
            for span in sorted(applicable_spans, reverse=True):
                # If the span we are looking at is a single word
                # and it is a stopword then we skip this. This ensures
                # that we don't skip longer spans of which the stopword
                # is only a part.
                if span == 1 and word in stop_words:
                    continue
                # We have to reconstruct a text span while adding spaces
                # where needed
                span_offset = raw_word_coords[idx][0]
                raw_span = ''
                for raw_word, word_coords in zip(
                        raw_words[idx:idx + span],
                        raw_word_coords[idx:idx + span]):
                    # Figure out if we need a space before this word, then
                    # append the word.
                    padding = ' ' * (word_coords[0] - span_offset
                                     - len(raw_span))
                    raw_span += padding + raw_word
                # If span is a single character, we don't want to consider it
                if len(raw_span) <= 1:
                    continue
                matches = grounder.ground(raw_span,
                                          context=context,
                                          organisms=organisms,
                                          namespaces=namespaces)
                if matches:
                    # Convert sentence-relative word coordinates into
                    # offsets within the full input text
                    start_coord = sent_start + raw_word_coords[idx][0]
                    end_coord = \
                        sent_start + raw_word_coords[idx + span - 1][1]
                    annotations.append(Annotation(
                        raw_span, matches, start_coord, end_coord
                    ))
                    # Words covered by this annotation are not considered
                    # as the start of another span
                    skip_until = idx + span
                    break
    return annotations
def get_brat(annotations, entity_type="Entity", ix_offset=1, include_text=True):
    """Serialize annotations into a brat-formatted annotation string.

    Each annotation produces two lines: a ``T<n>`` line carrying the entity
    type and the start/end offsets, and a ``#<n>`` AnnotatorNotes line
    carrying the CURIE of the annotation's first match.

    Parameters
    ----------
    annotations : list[Annotation]
        The named entity annotations to serialize.
    entity_type : str, optional
        The brat entity type to use for the annotations. The default is
        'Entity'. Any other value is additionally recorded in the note of
        each annotation as the reading system, which is useful for
        differentiating between annotations in the same text extracted
        from different reading systems.
    ix_offset : int, optional
        The index given to the first annotation; subsequent annotations are
        numbered consecutively. The default is 1, and smaller values are
        raised to 1.
    include_text : bool, optional
        Whether to append the matched text of each entity as a further
        tab-separated field on its ``T<n>`` line. The default is True.

    Returns
    -------
    str
        The brat-formatted annotations, one per line, ending with a newline.
    """
    first_index = max(1, ix_offset)
    # Only a non-default entity type is recorded in the annotator notes
    tag_reading_system = entity_type != "Entity"
    lines = []
    for number, ann in enumerate(annotations, first_index):
        note = ann.matches[0].term.get_curie()
        if tag_reading_system:
            note += f"; Reading system: {entity_type}"
        entity_line = f'T{number}\t{entity_type} {ann.start} {ann.end}'
        if include_text:
            entity_line += f'\t{ann.text}'
        lines.append(entity_line)
        lines.append(f'#{number}\tAnnotatorNotes T{number}\t{note}')
    return '\n'.join(lines) + '\n'