Skip to content

Commit

Permalink
feat(api): Added bdi.materialize_mapping() and basic value mappers
Browse files Browse the repository at this point in the history
  • Loading branch information
aecio committed Jun 7, 2024
1 parent ca8f0b8 commit 6db5dbb
Show file tree
Hide file tree
Showing 4 changed files with 184 additions and 0 deletions.
23 changes: 23 additions & 0 deletions bdikit/functional_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
JaccardDistanceAlgorithm,
GPTAlgorithm,
)
from bdikit.mapping_algorithms.value_mapping.value_mappers import ValueMapper
from bdikit.mapping_algorithms.scope_reducing._algorithms.contrastive_learning.cl_api import (
ContrastiveLearningAPI,
)
Expand Down Expand Up @@ -113,3 +114,25 @@ def top_matches(
dfs.append(matches.sort_values(by="similarity", ascending=False))

return pd.concat(dfs, ignore_index=True)


def materialize_mapping(
    input_dataframe: pd.DataFrame, target: List[dict]
) -> pd.DataFrame:
    """
    Builds a new DataFrame by applying a list of column mapping
    specifications to input_dataframe.

    Each entry of target is a dict with three keys: "from" (name of the
    column read from input_dataframe), "to" (name of the column created in
    the output) and "mapper" (the object handed to map_column_values to
    transform the values). Output columns are created in the order in which
    the specifications are listed.
    """
    materialized = pd.DataFrame()
    for spec in target:
        source_name = spec["from"]
        destination_name = spec["to"]
        mapper = spec["mapper"]
        source_column = input_dataframe[source_name]
        materialized[destination_name] = map_column_values(
            source_column, destination_name, mapper
        )
    return materialized


def map_column_values(
    input_column: pd.Series, target: str, value_mapper: ValueMapper
) -> pd.Series:
    """
    Transforms the values of input_column using value_mapper and names the
    resulting column after target.

    Parameters:
        input_column: the column whose values are to be transformed.
        target: the name given to the returned column.
        value_mapper: object whose map() method receives input_column and
            returns the pandas Series holding the transformed values.

    Returns:
        A pandas Series with the mapped values and with name equal to target.
    """
    mapped_column = value_mapper.map(input_column)
    # rename() hands back a new Series object instead of assigning to .name
    # in place: a mapper is free to return the very Series it received, and
    # in that case an in-place assignment would also rename the caller's
    # input column.
    return mapped_column.rename(target)
62 changes: 62 additions & 0 deletions bdikit/mapping_algorithms/value_mapping/value_mappers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import pandas as pd


class ValueMapper:
    """
    A ValueMapper represents objects that transform the values in an input
    column to the values from a new output column.

    This class only defines the interface; concrete mappers subclass it and
    override map().
    """

    def map(self, input_column: pd.Series) -> pd.Series:
        """
        Every concrete ValueMapper should implement this method, which takes a
        pandas Series as input and returns a new pandas Series with transformed
        values.

        Raises:
            NotImplementedError: when called on a class that does not
                override this method.
        """
        # Fail loudly here rather than returning None, which would only
        # surface later as an unrelated error in the code using the result.
        raise NotImplementedError(
            f"{type(self).__name__} must implement the map() method"
        )


class IdentityValueMapper(ValueMapper):
    """
    A value mapper whose output column holds exactly the same values as the
    input column.
    """

    def map(self, input_column: pd.Series) -> pd.Series:
        """
        Returns a copy of input_column; the values are left untouched.
        """
        duplicate = input_column.copy()
        return duplicate


class FunctionValueMapper(ValueMapper):
    """
    A value mapper that produces the output column by passing every value of
    the input column through a user-supplied function.
    """

    def __init__(self, function):
        # Stored as-is; it is handed to pandas.Series.map when map() is called.
        self.function = function

    def map(self, input_column: pd.Series) -> pd.Series:
        """
        Returns the Series obtained by calling input_column.map with the
        function given to the constructor.
        """
        transform = self.function
        return input_column.map(transform)


class DictionaryMapper(ValueMapper):
    """
    A value mapper that produces the output column by looking up every value
    of the input column in a user-supplied dictionary.
    """

    def __init__(self, dictionary: dict):
        # Lookup table handed to pandas.Series.map when map() is called.
        self.dictionary = dictionary

    def map(self, input_column: pd.Series) -> pd.Series:
        """
        Returns the Series obtained by calling input_column.map with the
        dictionary given to the constructor. How values absent from the
        dictionary are handled is decided by pandas.Series.map.
        """
        lookup = self.dictionary
        return input_column.map(lookup)
57 changes: 57 additions & 0 deletions tests/test_api.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import bdikit as bdi
import pandas as pd
from bdikit.mapping_algorithms.value_mapping.value_mappers import (
FunctionValueMapper,
IdentityValueMapper,
)


def test_bdi_match_columns_with_dataframes():
Expand Down Expand Up @@ -124,3 +128,56 @@ def test_bdi_top_matches_gdc():
assert len(df_matches[df_filter]) == 5
assert "ethnicity" in df_matches[df_filter]["matches"].tolist()
assert "race" in df_matches[df_filter]["matches"].tolist()


def test_map_column_values():
    """
    Ensures that the map_column_values function correctly maps the values of
    a column and assigns the target name to the returned column.
    """
    # given
    target_column_name = "string column"
    value_mapper = FunctionValueMapper(function=lambda x: x.upper())
    str_column = pd.Series(data=["a", "b", "c", "d", "e"], name="column_str")

    # when
    mapped_column = bdi.map_column_values(
        str_column,
        target=target_column_name,
        value_mapper=value_mapper,
    )

    # then
    assert mapped_column.name == target_column_name
    values_match = mapped_column.eq(["A", "B", "C", "D", "E"])
    assert values_match.all()


def test_map_dataframe_column_values():
    """
    Ensures that materialize_mapping creates one output column per mapping
    specification, named after "to" and holding the values produced by the
    specification's mapper.
    """
    # given
    first_values = ["a", "b", "c", "d", "e"]
    second_values = ["a", "b", "c", "d", "e"]
    df_base = pd.DataFrame(
        {"column_str_1": first_values, "column_str_2": second_values}
    )

    identity_spec = {
        "from": "column_str_1",
        "to": "string column 1",
        "mapper": IdentityValueMapper(),
    }
    upper_case_spec = {
        "from": "column_str_2",
        "to": "string column 2",
        "mapper": FunctionValueMapper(function=lambda x: x.upper()),
    }
    value_mapping_spec = [identity_spec, upper_case_spec]

    # when
    df_mapped = bdi.materialize_mapping(df_base, target=value_mapping_spec)

    # then
    assert len(df_mapped.columns) == 2

    # the identity mapper keeps the original values
    assert "string column 1" in df_mapped.columns
    assert df_mapped["string column 1"].eq(first_values).all()

    # the function mapper upper-cases every value
    upper_cased_values = ["A", "B", "C", "D", "E"]
    assert "string column 2" in df_mapped.columns
    assert df_mapped["string column 2"].eq(upper_cased_values).all()
42 changes: 42 additions & 0 deletions tests/test_value_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import pandas as pd
from bdikit.mapping_algorithms.value_mapping import (
FunctionValueMapper,
DictionaryMapper,
IdentityValueMapper,
)


def test_identity_mapper():
    """
    Ensures that IdentityValueMapper returns the input values unchanged.
    """
    # given
    letters = ["a", "b", "c", "d", "e"]
    identity_mapper = IdentityValueMapper()
    str_column = pd.Series(data=letters, name="column_str")

    # when
    mapped_column = identity_mapper.map(str_column)

    # then
    unchanged = mapped_column.eq(["a", "b", "c", "d", "e"])
    assert unchanged.all()


def test_dictionary_mapper():
    """
    Ensures that DictionaryMapper replaces each input value with the value
    stored under it in the dictionary.
    """
    # given
    letter_to_number = {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5}
    dict_mapper = DictionaryMapper(dictionary=letter_to_number)
    str_column = pd.Series(data=["a", "b", "c", "d", "e"], name="column_str")

    # when
    mapped_column = dict_mapper.map(str_column)

    # then
    expected_numbers = [1, 2, 3, 4, 5]
    assert mapped_column.eq(expected_numbers).all()


def test_custom_function_mapper():
    """
    Ensures that FunctionValueMapper applies the custom function to each
    value of the input column.
    """
    # given
    fn_mapper = FunctionValueMapper(function=lambda x: x + x)
    str_column = pd.Series(data=["a", "b", "c", "d", "e"], name="column_str")

    # when
    mapped_column = fn_mapper.map(str_column)

    # then
    doubled_values = ["aa", "bb", "cc", "dd", "ee"]
    assert mapped_column.eq(doubled_values).all()

0 comments on commit 6db5dbb

Please sign in to comment.