# Source code for gilda.scorer

from copy import deepcopy
from .process import replace_dashes, replace_whitespace, normalize, \
    get_capitalization_pattern

# Public API: the names exported by a star-import of this module.
__all__ = [
    "Match",
    "generate_match",
    "score_string_match",
    "score_status",
    "score",
]


class Match(object):
    """Class representing a match between a query and a reference string.

    Parameters
    ----------
    query : str
        The query string.
    ref : str
        The reference string the query was compared against.
    exact : bool, optional
        Whether the match is flagged as exact. Defaults to False.
    space_mismatch : bool, optional
        Whether the two strings disagree on where spaces occur.
        Defaults to False.
    dash_mismatches : set of str, optional
        Which sides carry a dash the other side lacks. Defaults to an
        empty set.
    cap_combos : list of tuple, optional
        Pairs of ``(query_pattern, ref_pattern)`` capitalization labels for
        the pieces that did not match exactly. Defaults to an empty list.
    """

    # Combinations treated by score_acic as a harmless sentence-initial
    # capitalization difference. The rest of this class spells the label
    # 'sentence_initial_cap'; the bare 'sentence_initial' spelling is kept
    # so that anything relying on the old literal still matches.
    _SENTENCE_INITIAL_COMBOS = (
        {('sentence_initial_cap', 'all_lower')},
        {('sentence_initial', 'all_lower')},
    )

    def __init__(self, query, ref, exact=None, space_mismatch=None,
                 dash_mismatches=None, cap_combos=None):
        self.query = query
        self.ref = ref
        self.exact = exact if exact is not None else False
        self.space_mismatch = \
            space_mismatch if space_mismatch is not None else False
        # Default to a set (not a dict) so the attribute has the same type
        # whether or not the caller supplied a value.
        self.dash_mismatches = \
            dash_mismatches if dash_mismatches is not None else set()
        self.cap_combos = cap_combos if cap_combos is not None else []

    def __str__(self):
        attrs = ','.join('%s=%s' % (k, v) for k, v in self.__dict__.items())
        return 'Match(%s)' % attrs

    def __repr__(self):
        return str(self)

    def to_json(self):
        """Return the attributes of the match as a JSON-friendly dict."""
        return {
            'query': self.query,
            'ref': self.ref,
            'exact': self.exact,
            'space_mismatch': self.space_mismatch,
            'dash_mismatches': list(self.dash_mismatches),
            'cap_combos': self.cap_combos,
        }

    def _query_cases(self):
        """Return the set of query-side capitalization labels."""
        return {c[0] for c in self.cap_combos}

    def _ref_cases(self):
        """Return the set of reference-side capitalization labels."""
        return {c[1] for c in self.cap_combos}

    def score_short_abbr(self):
        """Return 0 if a reference of at most 3 characters differs from the
        query by all-caps vs. all-lower capitalization, otherwise 1."""
        if len(self.ref) <= 3 and \
                (('all_caps', 'all_lower') in self.cap_combos or
                 ('all_lower', 'all_caps') in self.cap_combos):
            return 0
        return 1

    def score_mixed(self):
        """Return 0 if a (mixed, mixed) combination is present, 1 if either
        side has any mixed-case piece, otherwise 2."""
        if ('mixed', 'mixed') in self.cap_combos:
            return 0
        if 'mixed' in self._query_cases() or 'mixed' in self._ref_cases():
            return 1
        return 2

    def score_exact(self):
        """Return 1 if the match is flagged exact, otherwise 0."""
        return 1 if self.exact is True else 0

    def score_acic(self):
        """Return a 0-2 score from the exact flag and capitalization combos.

        2 is returned for an exact match without any capitalization
        combinations, or when the only combination is a sentence-initial
        capitalized query against an all-lowercase reference. 1 is returned
        for an exact match whose combinations are limited to all-caps vs.
        sentence-initial capitalization. Everything else scores 0.
        """
        if self.exact is True and not self.cap_combos:
            return 2
        combos = set(self.cap_combos)
        if combos in self._SENTENCE_INITIAL_COMBOS:
            return 2
        if self.exact is True and combos <= \
                {('all_caps', 'sentence_initial_cap'),
                 ('sentence_initial_cap', 'all_caps')}:
            return 1
        return 0

    def score_combo(self):
        """Return a score that decreases with the number of distinct
        capitalization labels seen on the query and reference sides.

        Each side starts at ``4 - <number of distinct labels>`` and receives
        bonus points for specific label constellations; the larger of the
        two side scores is returned.
        """
        qc = self._query_cases()
        rc = self._ref_cases()
        query_combo = 4 - len(qc)
        ref_combo = 4 - len(rc)
        if 'single_cap_letter' in qc and \
                ('all_caps' in qc or 'initial_cap' in qc):
            query_combo += 1
        if 'single_cap_letter' in rc and \
                ('all_caps' in rc or 'initial_cap' in rc):
            ref_combo += 1
        # `and` binds tighter than `or`: the bonus applies either when the
        # query's only label is a sentence-initial cap paired with an
        # all-lower reference, or when the query has any of the labels in
        # the set below.
        sentence_initial_only = (
            'sentence_initial_cap' in qc
            and len(qc) == 1
            and ('sentence_initial_cap', 'all_lower') in self.cap_combos
        )
        if sentence_initial_only or \
                {'single_cap_letter', 'initial_cap', 'all_lower'} & qc:
            query_combo += 1
        return max(query_combo, ref_combo)

    def score_dash(self):
        """Return 2 minus the number of sides with a dash mismatch."""
        return 2 - len(self.dash_mismatches)
def generate_match(query, ref, beginning_of_sentence=False):
    """Return a match data structure based on comparing a query to a ref str.

    Parameters
    ----------
    query : str
        The string to be compared against a reference string.
    ref : str
        The reference string against which the incoming query string is
        compared.
    beginning_of_sentence : bool
        True if the query_str appears at the beginning of a sentence, relevant
        for how capitalization is evaluated.

    Returns
    -------
    Match
        A Match object characterizing the match between the two strings.
    """
    # Pre-process both strings first by replacing multiple white spaces
    # with a single ASCII space, and all kinds of dashes with a single
    # ASCII dash.
    query = replace_dashes(replace_whitespace(query))
    ref = replace_dashes(replace_whitespace(ref))

    # If we have an exact match at this point then we can return immediately
    if not beginning_of_sentence and query == ref:
        return Match(query, ref, exact=True)

    query_suffix = query
    ref_suffix = ref
    query_pieces = ['']
    ref_pieces = ['']
    dash_mismatches = set()
    # Walk both strings in lockstep until either one is exhausted
    while query_suffix and ref_suffix:
        # Deal with spaces first
        qs = (query_suffix[0] == ' ')
        rs = (ref_suffix[0] == ' ')
        # If both have spaces, we start new pieces and skip the spaces
        if qs and rs:
            query_suffix = query_suffix[1:]
            ref_suffix = ref_suffix[1:]
            query_pieces.append('')
            ref_pieces.append('')
            # The shared space may have been the last character of either
            # string; indexing [0] below would then raise IndexError, so
            # stop scanning here.
            if not query_suffix or not ref_suffix:
                break
        # This means that there is a space inconsistency which we don't allow
        # and return immediately
        elif qs != rs:
            return Match(query, ref, space_mismatch=True)

        # We next deal with dashes
        qd = (query_suffix[0] == '-')
        rd = (ref_suffix[0] == '-')
        # If both are dashes, we skip them
        if qd and rd:
            query_suffix = query_suffix[1:]
            ref_suffix = ref_suffix[1:]
            query_pieces.append('')
            ref_pieces.append('')
        # If there is a mismatch, we introduce new pieces but only skip the one
        # dash and record the inconsistency
        elif qd and not rd:
            dash_mismatches.add('query')
            query_suffix = query_suffix[1:]
            query_pieces.append('')
            ref_pieces.append('')
        elif not qd and rd:
            dash_mismatches.add('ref')
            ref_suffix = ref_suffix[1:]
            query_pieces.append('')
            ref_pieces.append('')
        # Otherwise both strings start with a non space/dash character that we
        # add to the latest piece
        else:
            query_pieces[-1] += query_suffix[0]
            ref_pieces[-1] += ref_suffix[0]
            ref_suffix = ref_suffix[1:]
            query_suffix = query_suffix[1:]

    # Now that we have the final pieces in place, we can count the matches and
    # capitalization relationships
    combinations = []
    first = True
    exact = False
    for qp, rp in zip(query_pieces, ref_pieces):
        # Only the very first piece of the query can be sentence-initial
        first_bos = first and beginning_of_sentence
        first = False
        if qp == rp and not first_bos:
            exact = True
        else:
            qcp = get_capitalization_pattern(qp, first_bos)
            rcp = get_capitalization_pattern(rp, False)
            if qcp == rcp and qp == rp:
                exact = True
            else:
                combinations.append((qcp, rcp))

    return Match(query, ref, dash_mismatches=dash_mismatches,
                 exact=exact, cap_combos=combinations)
def score_string_match(match):
    """Return a score between 0 and 1 for the goodness of a match.

    The score depends only on how the query and reference strings relate
    to each other; the status of the reference is not taken into account.

    Parameters
    ----------
    match : gilda.scorer.Match
        The Match object characterizing the relationship of the query and
        reference strings.

    Returns
    -------
    float
        A match score between 0 and 1.
    """
    # Each entry is (radix, scorer). The scorer results are folded into one
    # mixed-radix number, so scorers listed earlier weigh more than all
    # later ones combined.
    weighted_scorers = (
        (2, match.score_short_abbr),
        (3, match.score_mixed),
        (2, match.score_exact),
        (3, match.score_acic),
        (5, match.score_combo),
        (3, match.score_dash),
    )
    total = 0
    scale = 1
    for radix, scorer in weighted_scorers:
        total = total * radix + scorer()
        scale = scale * radix
    # The largest value the fold can take is scale - 1
    return total / (scale - 1)
def score_status(term):
    """Return the numeric rank of a term's status.

    The rank is 1 for 'former_name', 2 for 'synonym', 3 for 'name' and
    4 for 'curated'. Any other status raises a KeyError.
    """
    ascending = ('former_name', 'synonym', 'name', 'curated')
    rank_by_status = {
        status: rank for rank, status in enumerate(ascending, start=1)
    }
    return rank_by_status[term.status]


def score(match, term):
    """Return the overall score combining string match and term status.

    The status rank is weighted twice as much as the string match score,
    and the sum is divided by 9.
    """
    string_part = score_string_match(match)
    status_part = score_status(term)
    return (2 * status_part + string_part) / 9