diff --git a/bdikit/functional_api.py b/bdikit/functional_api.py index 44c354ae..e7a76be9 100644 --- a/bdikit/functional_api.py +++ b/bdikit/functional_api.py @@ -12,6 +12,7 @@ JaccardDistanceAlgorithm, GPTAlgorithm, ) +from bdikit.mapping_algorithms.value_mapping.value_mappers import ValueMapper from bdikit.mapping_algorithms.scope_reducing._algorithms.contrastive_learning.cl_api import ( ContrastiveLearningAPI, ) @@ -113,3 +114,25 @@ def top_matches( dfs.append(matches.sort_values(by="similarity", ascending=False)) return pd.concat(dfs, ignore_index=True) + + +def materialize_mapping( + input_dataframe: pd.DataFrame, target: List[dict] +) -> pd.DataFrame: + output_dataframe = pd.DataFrame() + for mapping_spec in target: + from_column_name = mapping_spec["from"] + to_column_name = mapping_spec["to"] + value_mapper = mapping_spec["mapper"] + output_dataframe[to_column_name] = map_column_values( + input_dataframe[from_column_name], to_column_name, value_mapper + ) + return output_dataframe + + +def map_column_values( + input_column: pd.Series, target: str, value_mapper: ValueMapper +) -> pd.Series: + new_column = value_mapper.map(input_column) + new_column.name = target + return new_column diff --git a/bdikit/mapping_algorithms/value_mapping/value_mappers.py b/bdikit/mapping_algorithms/value_mapping/value_mappers.py new file mode 100644 index 00000000..4c36f4ad --- /dev/null +++ b/bdikit/mapping_algorithms/value_mapping/value_mappers.py @@ -0,0 +1,62 @@ +import pandas as pd + + +class ValueMapper: + """ + A ValueMapper represents objects that transform the values in an input + column to the values from a new output column. + """ + + def map(self, input_column: pd.Series) -> pd.Series: + """ + Every concrete ValueMapper should implement this method, which takes a + pandas Series as input and returns a new pandas Series with transformed + values. 
+ """ + pass + + +class IdentityValueMapper(ValueMapper): + """ + A column mapper that maps each value in input column into itself. + """ + + def map(self, input_column: pd.Series) -> pd.Series: + """ + Simply copies the values in input_column to the output column. + """ + return input_column.copy() + + +class FunctionValueMapper(ValueMapper): + """ + A column mapper that transforms each value in the input column using the + provided custom function. + """ + + def __init__(self, function): + self.function = function + + def map(self, input_column: pd.Series) -> pd.Series: + """ + Applies the given function to each value in input_column to generate + the output column. + """ + return input_column.map(self.function) + + +class DictionaryMapper(ValueMapper): + """ + A column mapper that transforms each value in the input column using the + values stored in the provided dictionary. + """ + + def __init__(self, dictionary: dict): + self.dictionary = dictionary + + def map(self, input_column: pd.Series) -> pd.Series: + """ + Transforms the values in the input_column to the values specified in + the dictionary provided using the object constructor. + """ + return input_column.map(self.dictionary) diff --git a/tests/test_api.py b/tests/test_api.py index ff641a33..39248a1c 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1,5 +1,9 @@ import bdikit as bdi import pandas as pd +from bdikit.mapping_algorithms.value_mapping.value_mappers import ( + FunctionValueMapper, + IdentityValueMapper, +) def test_bdi_match_columns_with_dataframes(): @@ -124,3 +128,56 @@ def test_bdi_top_matches_gdc(): assert len(df_matches[df_filter]) == 5 assert "ethnicity" in df_matches[df_filter]["matches"].tolist() assert "race" in df_matches[df_filter]["matches"].tolist() + + +def test_map_column_values(): + """ + Ensures that the map_column_values function correctly maps the values of + a column and assigns the target name. 
+ """ + # given + str_column = pd.Series(data=["a", "b", "c", "d", "e"], name="column_str") + value_mapper = FunctionValueMapper(function=lambda x: x.upper()) + target_column_name = "string column" + + # when + mapped_column = bdi.map_column_values( + str_column, target=target_column_name, value_mapper=value_mapper + ) + + # then + upper_cased_values = ["A", "B", "C", "D", "E"] + assert mapped_column.name == target_column_name + assert mapped_column.eq(upper_cased_values).all() + + +def test_map_dataframe_column_values(): + # given + str_column_1 = ["a", "b", "c", "d", "e"] + str_column_2 = ["a", "b", "c", "d", "e"] + df_base = pd.DataFrame({"column_str_1": str_column_1, "column_str_2": str_column_2}) + + value_mapping_spec = [ + { + "from": "column_str_1", + "to": "string column 1", + "mapper": IdentityValueMapper(), + }, + { + "from": "column_str_2", + "to": "string column 2", + "mapper": FunctionValueMapper(function=lambda x: x.upper()), + }, + ] + + # when + df_mapped = bdi.materialize_mapping(df_base, target=value_mapping_spec) + + # then + assert len(df_mapped.columns) == 2 + + assert "string column 1" in df_mapped.columns + assert df_mapped["string column 1"].eq(str_column_1).all() + + assert "string column 2" in df_mapped.columns + assert df_mapped["string column 2"].eq(["A", "B", "C", "D", "E"]).all() diff --git a/tests/test_value_mapping.py b/tests/test_value_mapping.py new file mode 100644 index 00000000..396a6949 --- /dev/null +++ b/tests/test_value_mapping.py @@ -0,0 +1,42 @@ +import pandas as pd +from bdikit.mapping_algorithms.value_mapping import ( + FunctionValueMapper, + DictionaryMapper, + IdentityValueMapper, +) + + +def test_identity_mapper(): + # given + str_column = pd.Series(data=["a", "b", "c", "d", "e"], name="column_str") + identity_mapper = IdentityValueMapper() + + # when + mapped_column = identity_mapper.map(str_column) + + # then + assert mapped_column.eq(["a", "b", "c", "d", "e"]).all() + + +def test_dictionary_mapper(): + # given + 
str_column = pd.Series(data=["a", "b", "c", "d", "e"], name="column_str") + dict_mapper = DictionaryMapper(dictionary={"a": 1, "b": 2, "c": 3, "d": 4, "e": 5}) + + # when + mapped_column = dict_mapper.map(str_column) + + # then + assert mapped_column.eq([1, 2, 3, 4, 5]).all() + + +def test_custom_function_mapper(): + # given + str_column = pd.Series(data=["a", "b", "c", "d", "e"], name="column_str") + fn_mapper = FunctionValueMapper(function=lambda x: x + x) + + # when + mapped_column = fn_mapper.map(str_column) + + # then + assert mapped_column.eq(["aa", "bb", "cc", "dd", "ee"]).all()