"""Module containing various string processing functions used for grounding."""
from typing import List, Tuple
import regex as re
import unidecode
from .greek_alphabet import greek_alphabet, greek_to_latin
#: Every dash-like character handled by this module: the minus sign (U+2212),
#: the ASCII hyphen-minus (U+002D), and the code points U+2010 through U+2015.
dashes = ['\u2212', '\u002d'] + list(map(chr, range(0x2010, 0x2016)))
def replace_dashes(s, rep='-'):
    """Substitute every kind of dash in a string with a replacement.

    Parameters
    ----------
    s : str
        The input string that may contain dash characters.
    rep : Optional[str]
        The text each dash character is swapped for. Defaults to the plain
        ASCII dash (-).

    Returns
    -------
    str
        A copy of the input with each character listed in ``dashes``
        replaced by ``rep``.
    """
    result = s
    # The characters are processed one after another, in the order in which
    # they appear in the module-level ``dashes`` list.
    for dash_char in dashes:
        if dash_char in result:
            result = result.replace(dash_char, rep)
    return result
def remove_dashes(s):
    """Strip every kind of dash from the given string.

    Parameters
    ----------
    s : str
        The input string that may contain dash characters.

    Returns
    -------
    str
        A copy of the input with all dash characters deleted.
    """
    # Deleting a dash is the same as replacing it with the empty string.
    return replace_dashes(s, rep='')
def replace_whitespace(s, rep=' '):
    """Replace each run of whitespace in the given string with a replacement.

    Parameters
    ----------
    s : str
        The string in which runs of whitespace (of any length) should be
        replaced.
    rep : Optional[str]
        The string with which each run of whitespace should be replaced. It
        is inserted literally, i.e., backslashes in it have no special
        meaning. By default, the plain ASCII space ( ) is used.

    Returns
    -------
    str
        The string in which whitespaces have been replaced.
    """
    # A callable replacement is used so that ``rep`` is inserted verbatim.
    # Passed directly, it would be treated as a replacement template in which
    # backslash sequences are interpreted (and a stray backslash is an error).
    return re.sub(r'\s+', lambda _match: rep, s)
def normalize(s):
    """Return a canonical form of a string for matching purposes.

    Parameters
    ----------
    s : str
        The string to be normalized.

    Returns
    -------
    str
        The input after, in this order: collapsing whitespace runs to single
        spaces, deleting dashes, applying ``replace_unicode``, and
        lower-casing.
    """
    without_dashes = remove_dashes(replace_whitespace(s))
    return replace_unicode(without_dashes).lower()
def split_preserve_tokens(s):
    """Split a string at non-word characters, keeping those characters.

    Parameters
    ----------
    s : str
        The string to be split.

    Returns
    -------
    list of str
        The pieces of the string in their original order. Each non-word
        character (typically a space or a dash) is kept as an element of its
        own between the words it separates.
    """
    # The capturing group is what makes the separators part of the output.
    separator = re.compile(r'(\W)')
    return separator.split(s)
def replace_greek_uni(s):
    """Replace Greek spelled out letters with their unicode character.

    Parameters
    ----------
    s : str
        The string in which spelled out Greek letter names should be
        replaced.

    Returns
    -------
    str
        The string in which each occurrence of a spelled out name from
        ``greek_alphabet`` has been replaced by the corresponding key.
    """
    # Substitute longer names first: a short name can be embedded in a longer
    # one (e.g., "eta" inside "theta"), and replacing the short one first
    # would break up the longer name before it gets a chance to match. The
    # sort is stable, so equally long names keep the table's own order.
    longest_first = sorted(greek_alphabet.items(),
                           key=lambda item: len(item[1]), reverse=True)
    for greek_uni, greek_spelled_out in longest_first:
        s = s.replace(greek_spelled_out, greek_uni)
    return s
def replace_greek_latin(s):
    """Replace Greek spelled out letters with their latin character.

    Parameters
    ----------
    s : str
        The string in which the keys of ``greek_to_latin`` should be
        replaced.

    Returns
    -------
    str
        The string in which each occurrence of a ``greek_to_latin`` key has
        been replaced by the value it maps to.
    """
    # Substitute longer keys first so that a key embedded in a longer one
    # (e.g., "eta" inside "theta") cannot break up the longer key before it
    # is matched. The sort is stable, so keys of equal length (including the
    # case where all keys are single characters) keep the table's own order.
    longest_first = sorted(greek_to_latin.items(),
                           key=lambda item: len(item[0]), reverse=True)
    for greek_spelled_out, latin in longest_first:
        s = s.replace(greek_spelled_out, latin)
    return s
def replace_greek_spelled_out(s):
    """Replace Greek unicode characters with their spelled out latin name.

    Parameters
    ----------
    s : str
        The string in which the keys of ``greek_alphabet`` should be
        replaced.

    Returns
    -------
    str
        The string in which each occurrence of a ``greek_alphabet`` key has
        been replaced by the value it maps to.
    """
    result = s
    for unicode_char in greek_alphabet:
        if unicode_char in result:
            result = result.replace(unicode_char, greek_alphabet[unicode_char])
    return result
def replace_unicode(s):
    """Transliterate unicode to ASCII while leaving Greek letters alone.

    Characters that are keys of ``greek_alphabet`` are kept as they are;
    every other character is passed through ``unidecode``.

    Parameters
    ----------
    s : str
        The string to transliterate.

    Returns
    -------
    str
        The transliterated string, or the input itself if ``unidecode``
        would not change it.
    """
    # Shortcut: nothing to do when transliteration leaves the string as is.
    if s == unidecode.unidecode(s):
        return s
    pieces = []
    for char in s:
        if char in greek_alphabet:
            pieces.append(char)
        else:
            pieces.append(unidecode.unidecode(char))
    return ''.join(pieces)
def get_capitalization_pattern(word, beginning_of_sentence=False):
    """Classify how a word is capitalized.

    Parameters
    ----------
    word : str
        The word to classify.
    beginning_of_sentence : Optional[bool]
        Whether the word is the first one in its sentence. Default: False

    Returns
    -------
    str
        One of sentence_initial_cap, single_cap_letter, all_caps, all_lower,
        initial_cap, or mixed. The last one is the fallback when none of the
        other patterns matches.
    """
    # A sentence-initial word consisting of one upper case letter followed by
    # any number of lower case letters takes precedence over everything else.
    if beginning_of_sentence and re.match(r'^\p{Lu}\p{Ll}*$', word):
        return 'sentence_initial_cap'
    # The remaining checks are tried in order and the first match wins; this
    # is why a single upper case letter is not reported as all_caps.
    ordered_checks = (
        (r'^\p{Lu}$', 'single_cap_letter'),
        (r'^\p{Lu}+$', 'all_caps'),
        (r'^\p{Ll}+$', 'all_lower'),
        (r'^\p{Lu}\p{Ll}+$', 'initial_cap'),
    )
    for pattern, label in ordered_checks:
        if re.match(pattern, word):
            return label
    return 'mixed'
def depluralize(word: str) -> List[Tuple[str, str]]:
    """Return candidate singular forms of a word, each with a status flag.

    Parameters
    ----------
    word : str
        The word which is to be depluralized.

    Returns
    -------
    list of str pairs:
        One or more (form, status) pairs. The form is the original word if
        it is detected to be non-plural, or a depluralized candidate
        otherwise. The status is one of non_plural (e.g., BRAF),
        plural_oes (e.g., mosquitoes), plural_ies (e.g., antibodies),
        plural_es (e.g., switches), plural_caps_s (e.g., MAPKs), and
        plural_s (e.g., receptors). For the -oes, -ies and -es cases a second
        candidate with only the final s removed (status plural_s) is also
        returned.
    """
    # A word not ending in s is assumed to be singular. Words ending in -sis
    # (e.g., apoptosis) are almost exclusively singular too.
    if not word.endswith('s') or word.endswith('sis'):
        return [(word, 'non_plural')]

    # The word with just the trailing s removed; used by every case below.
    stem = word[:-1]

    # Words ending in o that form the plural with -oes, e.g., mosquitoes
    if word.endswith('oes'):
        return [(word[:-2], 'plural_oes'), (stem, 'plural_s')]

    # Words ending in y that form the plural with -ies, e.g., antibodies
    if word.endswith('ies'):
        return [(word[:-3] + 'y', 'plural_ies'), (stem, 'plural_s')]

    # Words that form the plural by adding -es. Since the singular form
    # could itself end in e (e.g., -xe or -se), the variant with only the s
    # removed is returned as well.
    if word.endswith(('xes', 'ses', 'ches', 'shes')):
        return [(word[:-2], 'plural_es'), (stem, 'plural_s')]

    # An all upper case stem followed by s is a very strong signal of a
    # plural, so it gets its own status. Otherwise the final s is simply
    # assumed to be the plural marker.
    status = 'plural_caps_s' if re.match(r'^\p{Lu}+$', stem) else 'plural_s'
    return [(stem, status)]
    # Note: there don't seem to be any compelling examples of -f or -fe -> ves
    # so it is not implemented
def replace_roman_arabic(s):
    """Swap a trailing Roman numeral for its Arabic form, or vice versa.

    The numeral has to be at the very end of the string and be preceded by a
    dash or a space. Only numerals that have an entry in
    ``roman_arabic_patterns`` are converted.

    Parameters
    ----------
    s : str
        The string whose trailing numeral should be converted.

    Returns
    -------
    str
        The string with the trailing numeral converted, or the unchanged
        input if there is no convertible numeral.
    """
    # Cheap screen first: is there any numeral-like suffix at all?
    prefiltered = roman_arabic_prefilter.match(s)
    if prefiltered is None:
        return s
    # Table keys are upper case, hence the normalization of the suffix.
    entry = roman_arabic_patterns.get(prefiltered.group(1).upper())
    if not entry:
        return s
    compiled, replacement = entry
    return compiled.sub(replacement, s)
def _make_roman_arabic_patterns():
roman_arabic = {
'I': '1',
'II': '2',
'III': '3',
'IV': '4',
'V': '5',
'VI': '6',
'VII': '7',
'VIII': '8',
'IX': '9',
'X': '10'
}
roman_arabic_patterns = {}
for r, a in roman_arabic.items():
for a, b in [(r, a), (a, r)]:
roman_arabic_patterns[a] = (re.compile(r'^(.*[- ])(%s)$' % a,
re.IGNORECASE),
r'\g<1>%s' % b)
return roman_arabic_patterns
# Screen for strings that end in a dash or space followed by digits or by a
# run of the letters I, X and V (in any case); the suffix is captured.
roman_arabic_prefilter = re.compile(r'^.*[- ](\d+|[IXV]+)$', re.IGNORECASE)
# Per-numeral (compiled pattern, replacement template) pairs, built once when
# the module is loaded.
roman_arabic_patterns = _make_roman_arabic_patterns()