# Source code for gilda.pandas_utils

"""Utilities for Pandas."""

from functools import partial
from typing import Optional, Union, TYPE_CHECKING

from .grounder import Grounder
from . import api

if TYPE_CHECKING:
    import pandas

# Names exported as this module's public API (used by ``from ... import *``).
__all__ = [
    "ground_df",
    "ground_df_map",
]


def ground_df(
    df: "pandas.DataFrame",
    source_column: Union[str, int],
    *,
    target_column: Union[None, str, int] = None,
    grounder: Optional[Grounder] = None,
    **kwargs,
) -> None:
    """Write CURIE groundings for one dataframe column into the same dataframe.

    The dataframe is modified in-place; nothing is returned.

    Parameters
    ----------
    df :
        The pandas dataframe to annotate.
    source_column :
        The column whose cells hold the text to ground, i.e. labels or
        synonyms of named entities.
    target_column :
        The column that receives the groundings (a CURIE string, or None
        where no grounding is available). Passing a name that is not yet a
        column creates it. When omitted, the name
        ``<source column>_grounded`` is used.
    grounder :
        A custom grounder. When omitted, the built-in grounder is used.
    kwargs :
        Extra keyword arguments forwarded to :meth:`Grounder.ground`, such as
        context, organisms, or namespaces.

    Examples
    --------
    Ground the ``disease`` column of a CSV file into a new column:

    .. code-block:: python

        import pandas as pd
        import gilda

        url = "https://raw.githubusercontent.com/OBOAcademy/obook/master/docs/tutorial/linking_data/data.csv"
        df = pd.read_csv(url)
        gilda.ground_df(df, source_column="disease", target_column="disease_curie")
    """
    # Fall back to a column name derived from the source column.
    destination = (
        f"{source_column}_grounded" if target_column is None else target_column
    )
    groundings = ground_df_map(
        df=df,
        source_column=source_column,
        grounder=grounder,
        **kwargs,
    )
    df[destination] = groundings
def ground_df_map(
    df: "pandas.DataFrame",
    source_column: Union[str, int],
    *,
    grounder: Optional[Grounder] = None,
    **kwargs,
) -> "pandas.Series":
    """Compute CURIE groundings for one dataframe column without modifying it.

    Parameters
    ----------
    df :
        The pandas dataframe to read from.
    source_column :
        The column whose cells hold the text to ground, i.e. labels or
        synonyms of named entities.
    grounder :
        A custom grounder. When omitted, the built-in grounder is used.
    kwargs :
        Extra keyword arguments forwarded to :meth:`Grounder.ground`, such as
        context, organisms, or namespaces.

    Returns
    -------
    series :
        A pandas series holding the grounded CURIE strings. An entry is a
        missing value when grounding produced no match or when the source
        cell was not a string (for example, a NaN).
    """
    active_grounder = api.grounder if grounder is None else grounder

    def _ground_cell(cell):
        # Bind the chosen grounder and the caller's options to each cell.
        return _ground_helper(cell, grounder=active_grounder, **kwargs)

    return df[source_column].map(_ground_cell)
def _ground_helper(text, grounder: Grounder, **kwargs) -> Optional[str]:
    """Return the CURIE of the first match for ``text``, or None.

    Parameters
    ----------
    text :
        The cell value to ground. Anything that is not a string is skipped.
    grounder :
        The grounder whose ``ground`` method is called on ``text``.
    kwargs :
        Extra keyword arguments forwarded to ``grounder.ground``.

    Returns
    -------
    :
        The CURIE of the first returned match, or None when ``text`` is not a
        string or no matches were returned.
    """
    if isinstance(text, str):
        matches = grounder.ground(text, **kwargs)
        if matches:
            # Only the first returned match is used.
            return matches[0].term.get_curie()
    return None