Removed duplicated code from column mapping algorithms

VIDA-NYU · May 28, 2024 · 450a4e9 · 450a4e9
1 parent 0013da1
commit 450a4e9
Showing 1 changed file with 26 additions and 45 deletions.
diff --git a/bdikit/mapping_algorithms/column_mapping/algorithms.py b/bdikit/mapping_algorithms/column_mapping/algorithms.py
@@ -1,14 +1,29 @@
+import pandas as pd
 from valentine import valentine_match
 from valentine.algorithms import (
     SimilarityFlooding,
     Coma,
     Cupid,
     DistributionBased,
     JaccardDistanceMatcher,
+    BaseMatcher,
 )
+from valentine.algorithms.matcher_results import MatcherResults
 from openai import OpenAI
 
 
+def match_and_map_tables(
+    dataset: pd.DataFrame, global_table: pd.DataFrame, matcher: BaseMatcher
+) -> dict:
+    """Map entries of `dataset` to entries of `global_table` using `matcher`.
+
+    Only the one-to-one matches returned by `valentine_match` are kept.
+    """
+    results: MatcherResults = valentine_match(dataset, global_table, matcher)
+    # pair[0] is the dataset side, pair[1] the global side; presumably (table, column).
+    return {pair[0][1]: pair[1][1] for pair in results.one_to_one()}
+
+
 class BaseColumnMappingAlgorithm:
     def __init__(self, dataset, global_table):
         self._dataset = dataset
@@ -23,79 +38,45 @@ def __init__(self, dataset, global_table):
         super().__init__(dataset, global_table)
 
     def map(self):
-        matcher = SimilarityFlooding()
-        matches = valentine_match(self._dataset, self._global_table, matcher)
-
-        mappings = {}
-        for match in matches.one_to_one():
-            dataset_candidate = match[0][1]
-            global_table_candidate = match[1][1]
-            mappings[dataset_candidate] = global_table_candidate
-        return mappings
+        return match_and_map_tables(
+            self._dataset, self._global_table, SimilarityFlooding()
+        )
 
 
 class ComaAlgorithm(BaseColumnMappingAlgorithm):
     def __init__(self, dataset, global_table):
         super().__init__(dataset, global_table)
 
     def map(self):
-        matcher = Coma()
-        matches = valentine_match(self._dataset, self._global_table, matcher)
-
-        mappings = {}
-        for match in matches.one_to_one():
-            dataset_candidate = match[0][1]
-            global_table_candidate = match[1][1]
-            mappings[dataset_candidate] = global_table_candidate
-        return mappings
+        return match_and_map_tables(self._dataset, self._global_table, Coma())
 
 
 class CupidAlgorithm(BaseColumnMappingAlgorithm):
     def __init__(self, dataset, global_table):
         super().__init__(dataset, global_table)
 
     def map(self):
-        matcher = Cupid()
-        matches = valentine_match(self._dataset, self._global_table, matcher)
-
-        mappings = {}
-        for match in matches.one_to_one():
-            dataset_candidate = match[0][1]
-            global_table_candidate = match[1][1]
-            mappings[dataset_candidate] = global_table_candidate
-        return mappings
+        return match_and_map_tables(self._dataset, self._global_table, Cupid())
 
 
 class DistributionBasedAlgorithm(BaseColumnMappingAlgorithm):
     def __init__(self, dataset, global_table):
         super().__init__(dataset, global_table)
 
     def map(self):
-        matcher = DistributionBased()
-        matches = valentine_match(self._dataset, self._global_table, matcher)
-
-        mappings = {}
-        for match in matches.one_to_one():
-            dataset_candidate = match[0][1]
-            global_table_candidate = match[1][1]
-            mappings[dataset_candidate] = global_table_candidate
-        return mappings
+        return match_and_map_tables(
+            self._dataset, self._global_table, DistributionBased()
+        )
 
 
 class JaccardDistanceAlgorithm(BaseColumnMappingAlgorithm):
     def __init__(self, dataset, global_table):
         super().__init__(dataset, global_table)
 
     def map(self):
-        matcher = JaccardDistanceMatcher()
-        matches = valentine_match(self._dataset, self._global_table, matcher)
-
-        mappings = {}
-        for match in matches.one_to_one():
-            dataset_candidate = match[0][1]
-            global_table_candidate = match[1][1]
-            mappings[dataset_candidate] = global_table_candidate
-        return mappings
+        return match_and_map_tables(
+            self._dataset, self._global_table, JaccardDistanceMatcher()
+        )
 
 
 class GPTAlgorithm(BaseColumnMappingAlgorithm):