Skip to content

Commit

Permalink
Removed duplicated code from column mapping algorithms
Browse files Browse the repository at this point in the history
  • Loading branch information
aecio committed May 28, 2024
1 parent 0013da1 commit 450a4e9
Showing 1 changed file with 26 additions and 45 deletions.
71 changes: 26 additions & 45 deletions bdikit/mapping_algorithms/column_mapping/algorithms.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,29 @@
import pandas as pd
from valentine import valentine_match
from valentine.algorithms import (
SimilarityFlooding,
Coma,
Cupid,
DistributionBased,
JaccardDistanceMatcher,
BaseMatcher,
)
from valentine.algorithms.matcher_results import MatcherResults
from openai import OpenAI


def match_and_map_tables(
    dataset: pd.DataFrame, global_table: pd.DataFrame, matcher: BaseMatcher
) -> dict:
    """Match two tables with ``matcher`` and return the result as a dict.

    Runs ``valentine_match`` on ``dataset`` and ``global_table`` and walks
    the one-to-one matches it reports.

    Args:
        dataset: Table whose columns are being mapped.
        global_table: Table whose columns are the mapping targets.
        matcher: Valentine matcher instance passed to ``valentine_match``.

    Returns:
        A dict with one entry per one-to-one match: the key is element
        ``[0][1]`` of the match and the value is element ``[1][1]``.
    """
    results: MatcherResults = valentine_match(dataset, global_table, matcher)
    # NOTE(review): each match is presumably a pair of (table, column)
    # tuples, so index 1 picks the column name -- confirm against valentine.
    return {pair[0][1]: pair[1][1] for pair in results.one_to_one()}


class BaseColumnMappingAlgorithm:
def __init__(self, dataset, global_table):
self._dataset = dataset
Expand All @@ -23,79 +38,45 @@ def __init__(self, dataset, global_table):
super().__init__(dataset, global_table)

def map(self):
    """Map columns using the valentine ``SimilarityFlooding`` matcher.

    Delegates to ``match_and_map_tables`` rather than repeating the
    match-and-collect loop inline.

    Returns:
        The dict produced by ``match_and_map_tables`` for the stored
        dataset and global table.
    """
    return match_and_map_tables(
        self._dataset, self._global_table, SimilarityFlooding()
    )


class ComaAlgorithm(BaseColumnMappingAlgorithm):
    """Column mapping algorithm backed by the valentine ``Coma`` matcher."""

    def __init__(self, dataset, global_table):
        super().__init__(dataset, global_table)

    def map(self):
        """Map columns using ``Coma``.

        Delegates to ``match_and_map_tables`` rather than repeating the
        match-and-collect loop inline.

        Returns:
            The dict produced by ``match_and_map_tables`` for the stored
            dataset and global table.
        """
        return match_and_map_tables(self._dataset, self._global_table, Coma())


class CupidAlgorithm(BaseColumnMappingAlgorithm):
    """Column mapping algorithm backed by the valentine ``Cupid`` matcher."""

    def __init__(self, dataset, global_table):
        super().__init__(dataset, global_table)

    def map(self):
        """Map columns using ``Cupid``.

        Delegates to ``match_and_map_tables`` rather than repeating the
        match-and-collect loop inline.

        Returns:
            The dict produced by ``match_and_map_tables`` for the stored
            dataset and global table.
        """
        return match_and_map_tables(self._dataset, self._global_table, Cupid())


class DistributionBasedAlgorithm(BaseColumnMappingAlgorithm):
    """Column mapping algorithm backed by valentine's ``DistributionBased``."""

    def __init__(self, dataset, global_table):
        super().__init__(dataset, global_table)

    def map(self):
        """Map columns using ``DistributionBased``.

        Delegates to ``match_and_map_tables`` rather than repeating the
        match-and-collect loop inline.

        Returns:
            The dict produced by ``match_and_map_tables`` for the stored
            dataset and global table.
        """
        return match_and_map_tables(
            self._dataset, self._global_table, DistributionBased()
        )


class JaccardDistanceAlgorithm(BaseColumnMappingAlgorithm):
    """Column mapping algorithm backed by valentine's ``JaccardDistanceMatcher``."""

    def __init__(self, dataset, global_table):
        super().__init__(dataset, global_table)

    def map(self):
        """Map columns using ``JaccardDistanceMatcher``.

        Delegates to ``match_and_map_tables`` rather than repeating the
        match-and-collect loop inline.

        Returns:
            The dict produced by ``match_and_map_tables`` for the stored
            dataset and global table.
        """
        return match_and_map_tables(
            self._dataset, self._global_table, JaccardDistanceMatcher()
        )


class GPTAlgorithm(BaseColumnMappingAlgorithm):
Expand Down

0 comments on commit 450a4e9

Please sign in to comment.