Source code for gilda.process

"""Module containing various string processing functions used for grounding."""
from typing import List, Tuple

import regex as re
import unidecode

from .greek_alphabet import greek_alphabet, greek_to_latin

#: A list of all kinds of dashes: the minus sign (U+2212), the ASCII
#: hyphen-minus (U+002D), and the code points U+2010 through U+2015.
dashes = [chr(0x2212), chr(0x002d), *map(chr, range(0x2010, 0x2016))]

def replace_dashes(s, rep='-'):
    """Substitute every kind of dash in a string with one replacement.

    Parameters
    ----------
    s : str
        The string whose dash characters should be substituted.
    rep : Optional[str]
        The replacement inserted for each dash character. Defaults to the
        plain ASCII dash (-).

    Returns
    -------
    str
        A copy of the input in which each character listed in ``dashes``
        has been replaced by ``rep``.
    """
    result = s
    # The dash characters are substituted one after another, in the order
    # in which they appear in the module-level ``dashes`` list.
    for dash_char in dashes:
        if dash_char in result:
            result = result.replace(dash_char, rep)
    return result
def remove_dashes(s):
    """Strip every kind of dash from the given string.

    Parameters
    ----------
    s : str
        The string from which all dash characters should be removed.

    Returns
    -------
    str
        The input string with all dash characters deleted.
    """
    # Removing is the same as replacing each dash with the empty string.
    return replace_dashes(s, rep='')
def replace_whitespace(s, rep=' '):
    """Collapse each run of whitespace in a string into a replacement.

    Parameters
    ----------
    s : str
        The string whose whitespace runs should be substituted.
    rep : Optional[str]
        The replacement used for every run of one or more whitespace
        characters. Defaults to a single plain ASCII space. It is passed
        as the replacement argument of a regular expression substitution.

    Returns
    -------
    str
        The string in which each whitespace run has been replaced.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(rep, s)
def normalize(s):
    """Normalize the whitespace, dashes, unicode and case of a string.

    Parameters
    ----------
    s : str
        The string to be normalized.

    Returns
    -------
    str
        The normalized string: whitespace runs collapsed, dashes removed,
        unicode passed through ``replace_unicode``, and finally lowercased.
    """
    # The steps are applied strictly in this order; lowercasing comes last.
    for step in (replace_whitespace, remove_dashes, replace_unicode):
        s = step(s)
    return s.lower()
def split_preserve_tokens(s):
    """Split a string into words while keeping the separators.

    Parameters
    ----------
    s : str
        The string to be split.

    Returns
    -------
    list of str
        The words of the string interleaved with the single non-word
        characters that separated them (typically spaces and dashes).
        Adjacent separators yield empty strings between them.
    """
    # The capturing group makes the split keep each separator in the output.
    separator = re.compile(r'(\W)')
    return separator.split(s)
def replace_greek_uni(s):
    """Replace Greek spelled out letters with their unicode character.

    Parameters
    ----------
    s : str
        The string in which spelled out Greek letter names should be
        replaced.

    Returns
    -------
    str
        The string in which each spelled out name found in the values of
        ``greek_alphabet`` has been replaced by the corresponding key.
    """
    # Several Greek letter names are contained in longer ones (e.g., "eta"
    # in "beta", "zeta" and "theta", or "psi" in "epsilon" and "upsilon").
    # Replacing in plain dictionary order can therefore corrupt a longer
    # name if a shorter one happens to be processed first (e.g., "theta"
    # turning into "th" + eta). Processing the longest names first avoids
    # this. The sort is stable, so names of equal length keep their
    # original relative order.
    longest_first = sorted(greek_alphabet.items(),
                           key=lambda item: len(item[1]),
                           reverse=True)
    for greek_uni, greek_spelled_out in longest_first:
        s = s.replace(greek_spelled_out, greek_uni)
    return s
def replace_greek_latin(s):
    """Replace Greek spelled out letters with their latin character.

    Parameters
    ----------
    s : str
        The string in which spelled out Greek letter names should be
        replaced.

    Returns
    -------
    str
        The string in which each key of ``greek_to_latin`` found in the
        input has been replaced by the corresponding value.
    """
    # Several Greek letter names are contained in longer ones (e.g., "eta"
    # in "beta", "zeta" and "theta", or "psi" in "epsilon" and "upsilon").
    # Replacing in plain dictionary order can therefore corrupt a longer
    # name if a shorter one happens to be processed first. Processing the
    # longest keys first avoids this. The sort is stable, so keys of equal
    # length keep their original relative order.
    longest_first = sorted(greek_to_latin.items(),
                           key=lambda item: len(item[0]),
                           reverse=True)
    for greek_spelled_out, latin in longest_first:
        s = s.replace(greek_spelled_out, latin)
    return s
def replace_greek_spelled_out(s):
    """Replace Greek unicode characters with their latin spelled out name.

    Parameters
    ----------
    s : str
        The string in which Greek unicode characters should be replaced.

    Returns
    -------
    str
        The string in which each key of ``greek_alphabet`` found in the
        input has been replaced by the corresponding value.
    """
    result = s
    for unicode_char, spelled_out in greek_alphabet.items():
        if unicode_char in result:
            result = result.replace(unicode_char, spelled_out)
    return result
def replace_unicode(s):
    """Replace unicode with its ASCII equivalent, except Greek letters.

    Characters that are keys of ``greek_alphabet`` are handled separately
    elsewhere and are therefore left untouched here.

    Parameters
    ----------
    s : str
        The string to be converted.

    Returns
    -------
    str
        The input itself if transliterating it changes nothing, otherwise
        a string in which every character outside ``greek_alphabet`` has
        been transliterated individually.
    """
    # Fast path: nothing in the string is affected by transliteration.
    if unidecode.unidecode(s) == s:
        return s
    converted = []
    for char in s:
        if char in greek_alphabet:
            converted.append(char)
        else:
            converted.append(unidecode.unidecode(char))
    return ''.join(converted)
def get_capitalization_pattern(word, beginning_of_sentence=False):
    """Classify the capitalization of a word.

    Parameters
    ----------
    word : str
        The word whose capitalization is determined.
    beginning_of_sentence : Optional[bool]
        True if the word appears at the beginning of a sentence.
        Default: False

    Returns
    -------
    str
        The capitalization pattern of the given word, one of:
        sentence_initial_cap, single_cap_letter, all_caps, all_lower,
        initial_cap, mixed.
    """
    # A sentence-initial word made of one uppercase letter followed by any
    # number of lowercase letters takes precedence over all other patterns.
    if beginning_of_sentence and re.match(r'^\p{Lu}\p{Ll}*$', word):
        return 'sentence_initial_cap'

    # The remaining patterns are tried in order and the first match wins;
    # a single capital letter is therefore never reported as all_caps.
    ordered_patterns = (
        (r'^\p{Lu}$', 'single_cap_letter'),
        (r'^\p{Lu}+$', 'all_caps'),
        (r'^\p{Ll}+$', 'all_lower'),
        (r'^\p{Lu}\p{Ll}+$', 'initial_cap'),
    )
    for pattern, label in ordered_patterns:
        if re.match(pattern, word):
            return label
    return 'mixed'
def depluralize(word: str) -> List[Tuple[str, str]]:
    """Return candidate depluralized forms of a word with status flags.

    Parameters
    ----------
    word : str
        The word which is to be depluralized.

    Returns
    -------
    list of str pairs:
        Each pair holds the original word, if it is detected to be
        non-plural, or a depluralized version of the word, together with
        a status flag representing the detected pluralization status:
        non_plural (e.g., BRAF), plural_oes (e.g., mosquitoes),
        plural_ies (e.g., antibodies), plural_es (e.g., switches),
        plural_caps_s (e.g., MAPKs), and plural_s (e.g., receptors).
        Several endings yield two candidates, the second one always
        being the word with only the final s removed.
    """
    # A word that doesn't end in s is assumed not to be plural. Words
    # ending in -sis (e.g., apoptosis) are almost exclusively non plural
    # as well, so both cases return the word unchanged.
    if not word.endswith('s') or word.endswith('sis'):
        return [(word, 'non_plural')]

    without_s = word[:-1]
    strip_s = (without_s, 'plural_s')

    # Words ending with an o which is pluralized as oes, e.g., mosquitoes
    if word.endswith('oes'):
        return [(word[:-2], 'plural_oes'), strip_s]

    # Words ending with a y which is pluralized as ies, e.g., antibodies
    if word.endswith('ies'):
        return [(word[:-3] + 'y', 'plural_ies'), strip_s]

    # Words that form plurals by adding -es, which is stripped off.
    # However, it's not possible to determine whether the singular form
    # itself ends in e.g., -xe or -se, so the variant with only the final
    # s removed is returned too.
    if word.endswith(('xes', 'ses', 'ches', 'shes')):
        return [(word[:-2], 'plural_es'), strip_s]

    # If everything before the final s is upper case, that is a very
    # strong signal of pluralization, so it gets a custom status flag.
    if re.match(r'^\p{Lu}+$', without_s):
        return [(without_s, 'plural_caps_s')]

    # Otherwise, assume that the last s is the plural marker.
    return [strip_s]
# Note: there don't seem to be any compelling examples of -f or -fe -> ves
# so it is not implemented


def replace_roman_arabic(s):
    """Swap a trailing Roman numeral for its Arabic form or vice versa.

    Only a numeral at the very end of the string that is preceded by a
    dash or a space is considered, and only the values 1 through 10
    (I through X) are converted. Roman numerals are matched ignoring
    case.

    Parameters
    ----------
    s : str
        The string whose trailing numeral should be converted.

    Returns
    -------
    str
        The string with its trailing numeral converted, or the unchanged
        input if it has no convertible trailing numeral.
    """
    prefilter_match = roman_arabic_prefilter.match(s)
    if prefilter_match is None:
        return s
    # The lookup table is keyed by upper case Roman numerals and digits.
    numeral = prefilter_match.group(1).upper()
    entry = roman_arabic_patterns.get(numeral)
    if entry is None:
        return s
    matcher, replacement = entry
    return matcher.sub(replacement, s)


def _make_roman_arabic_patterns():
    """Build the numeral lookup table used by ``replace_roman_arabic``.

    Returns
    -------
    dict
        A dict keyed by each Roman numeral I-X and each Arabic number
        1-10, whose values are pairs of a compiled pattern matching that
        numeral at the end of a string and the replacement template that
        substitutes its counterpart.
    """
    roman_to_arabic = {
        'I': '1',
        'II': '2',
        'III': '3',
        'IV': '4',
        'V': '5',
        'VI': '6',
        'VII': '7',
        'VIII': '8',
        'IX': '9',
        'X': '10'
    }
    patterns = {}
    for roman, arabic in roman_to_arabic.items():
        # Register both conversion directions for each numeral pair.
        for source, target in ((roman, arabic), (arabic, roman)):
            # The explicit \g<1> form keeps the group reference separate
            # from a replacement that starts with a digit.
            patterns[source] = (
                re.compile(r'^(.*[- ])(%s)$' % source, re.IGNORECASE),
                r'\g<1>%s' % target,
            )
    return patterns


roman_arabic_patterns = _make_roman_arabic_patterns()
roman_arabic_prefilter = re.compile(r'^.*[- ](\d+|[IXV]+)$', re.IGNORECASE)