Source code for gilda.scorer

from copy import deepcopy
from .process import replace_dashes, replace_whitespace, normalize, \
    get_capitalization_pattern

# Public names exported by a star-import of this module.
__all__ = [
    "Match",
    "generate_match",
    "score_string_match",
    "score_status",
    "score",
]


class Match(object):
    """Class representing a match between a query and a reference string.

    Parameters
    ----------
    query : str
        The query string.
    ref : str
        The reference string the query was compared against.
    exact : bool, optional
        Whether an exact match was found. Defaults to False.
    space_mismatch : bool, optional
        Whether the two strings disagree in where spaces appear.
        Defaults to False.
    dash_mismatches : set of str, optional
        The sides ('query' and/or 'ref') that contain a dash the other side
        lacks. Defaults to an empty set.
    cap_combos : list of tuple, optional
        Pairs of (query capitalization pattern, reference capitalization
        pattern) for the pieces that did not match exactly. Defaults to an
        empty list.
    """
    def __init__(self, query, ref, exact=None, space_mismatch=None,
                 dash_mismatches=None, cap_combos=None):
        self.query = query
        self.ref = ref
        self.exact = exact if exact is not None else False
        self.space_mismatch = space_mismatch if space_mismatch is not None \
            else False
        # Default to an empty set (not a dict) so that the attribute has the
        # same type whether or not the caller supplied it
        self.dash_mismatches = dash_mismatches if dash_mismatches is not None \
            else set()
        self.cap_combos = cap_combos if cap_combos is not None else []

    def __str__(self):
        return 'Match(%s)' % (','.join(['%s=%s' % (k, v) for k, v in
                                        self.__dict__.items()]))

    def __repr__(self):
        return str(self)

    def to_json(self):
        """Return a dict representation of the match.

        Returns
        -------
        dict
            The attributes of the match keyed by name; dash mismatches are
            given as a sorted list.
        """
        return {
            'query': self.query,
            'ref': self.ref,
            'exact': self.exact,
            'space_mismatch': self.space_mismatch,
            # Sorted so that the output does not depend on set ordering
            'dash_mismatches': sorted(self.dash_mismatches),
            'cap_combos': self.cap_combos
        }

    def _query_cases(self):
        """Return the set of query-side capitalization patterns."""
        return {c[0] for c in self.cap_combos}

    def _ref_cases(self):
        """Return the set of reference-side capitalization patterns."""
        return {c[1] for c in self.cap_combos}

    def score_short_abbr(self):
        """Return 0 for a case-flipped reference of at most 3 characters,
        otherwise 1."""
        if len(self.ref) <= 3 and \
                (('all_caps', 'all_lower') in self.cap_combos or
                 ('all_lower', 'all_caps') in self.cap_combos):
            return 0
        else:
            return 1

    def score_mixed(self):
        """Return 0 if a mixed/mixed combination is present, 1 if only one
        side has a mixed pattern, and 2 if neither side does."""
        if ('mixed', 'mixed') in self.cap_combos:
            return 0
        elif ('mixed' in self._query_cases()) or \
                ('mixed' in self._ref_cases()):
            return 1
        else:
            return 2

    def score_exact(self):
        """Return 1 if the match is flagged as exact, otherwise 0."""
        return 1 if self.exact is True else 0

    def score_acic(self):
        """Return a 0-2 score based on exactness and capitalization combos.

        Returns
        -------
        int
            2 for an exact match without any capitalization differences, or
            when the only difference is a sentence-initial capital in the
            query against an all-lowercase reference; 1 for an exact match
            whose only differences are between all caps and sentence-initial
            caps; 0 otherwise.
        """
        if self.exact is True and not self.cap_combos:
            return 2
        combos = set(self.cap_combos)
        # The label used everywhere else in this class is
        # 'sentence_initial_cap'; the shorter 'sentence_initial' spelling is
        # still accepted for backward compatibility
        if combos in ({('sentence_initial_cap', 'all_lower')},
                      {('sentence_initial', 'all_lower')}):
            return 2
        if self.exact is True and combos <= \
                {('all_caps', 'sentence_initial_cap'),
                 ('sentence_initial_cap', 'all_caps')}:
            return 1
        return 0

    def score_combo(self):
        """Return a score that is higher the fewer distinct capitalization
        patterns appear on the query and reference sides."""
        qc = self._query_cases()
        rc = self._ref_cases()
        # Start from 4 and subtract one for each distinct pattern seen
        query_combo = 4 - len(qc)
        ref_combo = 4 - len(rc)
        if 'single_cap_letter' in qc and \
                ('all_caps' in qc or 'initial_cap' in qc):
            query_combo += 1
        if 'single_cap_letter' in rc and \
                ('all_caps' in rc or 'initial_cap' in rc):
            ref_combo += 1
        sentence_initial_only = (
            'sentence_initial_cap' in qc and len(qc) == 1 and
            ('sentence_initial_cap', 'all_lower') in self.cap_combos
        )
        # NOTE(review): `and` binds tighter than `or`, so the second
        # alternative applies whether or not 'sentence_initial_cap' is among
        # the query patterns. The grouping is kept as is - confirm that this
        # is intended.
        if sentence_initial_only or \
                {'single_cap_letter', 'initial_cap', 'all_lower'} & qc:
            query_combo += 1
        return max(query_combo, ref_combo)

    def score_dash(self):
        """Return 2 minus the number of sides with a dash mismatch."""
        return 2 - len(self.dash_mismatches)


def generate_match(query, ref, beginning_of_sentence=False):
    """Return a match data structure based on comparing a query to a ref str.

    The two strings are walked in parallel and split into pieces at spaces
    and dashes. The walk stops as soon as either string is exhausted, so any
    remaining characters of the other string are not compared.

    Parameters
    ----------
    query : str
        The string to be compared against a reference string.
    ref : str
        The reference string against which the incoming query string is
        compared.
    beginning_of_sentence : bool
        True if the query_str appears at the beginning of a sentence, relevant
        for how capitalization is evaluated.

    Returns
    -------
    Match
        A Match object characterizing the match between the two strings.
    """
    # Pre-process both strings first by replacing multiple white spaces
    # with a single ASCII space, and all kinds of dashes with a single
    # ASCII dash.
    query = replace_dashes(replace_whitespace(query))
    # Corner case: some synonyms may have a trailing space which we can
    # safely remove to avoid issues in the comparison
    ref = replace_whitespace(ref).rstrip(' ')
    ref = replace_dashes(ref)

    # If we have an exact match at this point then we can return immediately
    if not beginning_of_sentence and query == ref:
        return Match(query, ref, exact=True)
    query_suffix = query
    ref_suffix = ref
    query_pieces = ['']
    ref_pieces = ['']
    dash_mismatches = set()
    while query_suffix and ref_suffix:
        # Deal with spaces first
        qs = (query_suffix[0] == ' ')
        rs = (ref_suffix[0] == ' ')
        # If both have spaces, we start new pieces and skip the spaces
        if qs and rs:
            query_suffix = query_suffix[1:]
            ref_suffix = ref_suffix[1:]
            query_pieces.append('')
            ref_pieces.append('')
            # Skipping the space may have exhausted one of the strings (e.g.,
            # a query with a trailing space), so the loop condition has to
            # be re-checked before the first character is looked at again
            continue
        # This means that there is a space inconsistency which we don't allow
        # and return immediately
        if qs != rs:
            return Match(query, ref, space_mismatch=True)

        # We next deal with dashes
        qd = (query_suffix[0] == '-')
        rd = (ref_suffix[0] == '-')
        # If both are dashes, we skip them
        if qd and rd:
            query_suffix = query_suffix[1:]
            ref_suffix = ref_suffix[1:]
            query_pieces.append('')
            ref_pieces.append('')
        # If there is a mismatch, we introduce new pieces but only skip the one
        # dash and record the inconsistency
        elif qd and not rd:
            dash_mismatches.add('query')
            query_suffix = query_suffix[1:]
            query_pieces.append('')
            ref_pieces.append('')
        elif not qd and rd:
            dash_mismatches.add('ref')
            ref_suffix = ref_suffix[1:]
            query_pieces.append('')
            ref_pieces.append('')
        # Otherwise both strings start with a non space/dash character that we
        # add to the latest piece
        else:
            query_pieces[-1] += query_suffix[0]
            ref_pieces[-1] += ref_suffix[0]
            ref_suffix = ref_suffix[1:]
            query_suffix = query_suffix[1:]

    # Now that we have the final pieces in place, we can count the matches and
    # capitalization relationships
    combinations = []
    first = True
    exact = False
    for qp, rp in zip(query_pieces, ref_pieces):
        # Only the very first piece of the query can be sentence-initial
        first_bos = first and beginning_of_sentence
        first = False
        if qp == rp and not first_bos:
            exact = True
        else:
            qcp = get_capitalization_pattern(qp, first_bos)
            rcp = get_capitalization_pattern(rp, False)
            if qcp == rcp and qp == rp:
                exact = True
            else:
                combinations.append((qcp, rcp))
    return Match(query, ref, dash_mismatches=dash_mismatches,
                 exact=exact, cap_combos=combinations)


def score_string_match(match):
    """Return a score between 0 and 1 for the goodness of a match.

    This score is purely based on the relationship of the two strings and
    does not take the status of the reference into account.

    Parameters
    ----------
    match : gilda.scorer.Match
        The Match object characterizing the relationship of the query and
        reference strings.

    Returns
    -------
    float
        A match score between 0 and 1.
    """
    # Each entry is a scoring method and the coefficient by which the running
    # score is multiplied before that method's value is added. Entries
    # listed earlier therefore carry more weight than later ones.
    terms = [
        (match.score_short_abbr, 2),
        (match.score_mixed, 3),
        (match.score_exact, 2),
        (match.score_acic, 3),
        (match.score_combo, 5),
        (match.score_dash, 3)
    ]
    score = 0
    norm = 1
    for fun, coeff in terms:
        score = coeff * score + fun()
        norm *= coeff
    # If every method returns a value between 0 and its coefficient minus 1,
    # the largest possible running score is norm - 1, so this division
    # scales the result into the 0-1 range
    return score / (norm - 1)


def score_status(term):
    """Return an integer score for the status of a term.

    Parameters
    ----------
    term : object
        An object with a ``status`` attribute that is one of 'curated',
        'name', 'synonym' or 'former_name'.

    Returns
    -------
    int
        4 for 'curated', 3 for 'name', 2 for 'synonym' and 1 for
        'former_name'.

    Raises
    ------
    KeyError
        If the status of the term is not one of the four known values.
    """
    # Statuses listed from lowest to highest score, with scores starting at 1
    ranked_statuses = ('former_name', 'synonym', 'name', 'curated')
    status_scores = {
        status: rank for rank, status in enumerate(ranked_statuses, start=1)
    }
    return status_scores[term.status]


def score(match, term):
    """Return a combined score for a string match and the matched term.

    The status score of the term is doubled, the string match score is added
    to it, and the sum is divided by 9.

    Parameters
    ----------
    match : Match
        The match between the query and the reference string, scored with
        score_string_match.
    term : object
        The matched term, scored with score_status based on its status.

    Returns
    -------
    float
        The combined score.
    """
    string_part = score_string_match(match)
    status_part = score_status(term)
    # The status score is doubled so that it outweighs the string match score
    return (status_part * 2 + string_part) / 9