From 971575286f51cb514b73f97fb44ae278ef3e6397 Mon Sep 17 00:00:00 2001 From: Roque Lopez Date: Mon, 13 Jan 2025 17:26:53 -0500 Subject: [PATCH 1/3] feat: Add Magneto --- bdikit/download.py | 1 + .../one2one/matcher_factory.py | 19 +++ bdikit/schema_matching/topk/magneto.py | 132 ++++++++++++++++++ .../schema_matching/topk/matcher_factory.py | 20 +++ requirements.txt | 3 +- 5 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 bdikit/schema_matching/topk/magneto.py diff --git a/bdikit/download.py b/bdikit/download.py index 9bcc5d8..d677b9d 100644 --- a/bdikit/download.py +++ b/bdikit/download.py @@ -15,6 +15,7 @@ BUILTIN_MODELS_BOX_URL = { "cl-reducer-v0.1": "https://nyu.box.com/shared/static/hc4qxzbuxz0uoynfwy4pe2yxo5ch6xgm.pt", "bdi-cl-v0.2": "https://nyu.box.com/shared/static/1vdc28kzbpoj6ey95bksaww541p9gj31.pt", + "magneto-gdc-v0.1": "https://nyu.box.com/shared/static/140g2rq1izc1wqs1ssrml6jzag3qa0mu.pth", } BDIKIT_EMBEDDINGS_CACHE_DIR = os.path.join(BDIKIT_CACHE_DIR, "embeddings") diff --git a/bdikit/schema_matching/one2one/matcher_factory.py b/bdikit/schema_matching/one2one/matcher_factory.py index 30e44d6..df29853 100644 --- a/bdikit/schema_matching/one2one/matcher_factory.py +++ b/bdikit/schema_matching/one2one/matcher_factory.py @@ -38,6 +38,25 @@ class SchemaMatchers(Enum): "max_val_sim", "bdikit.schema_matching.one2one.maxvalsim.MaxValSimSchemaMatcher", ) + MAGNETO = ( + "magneto", + "bdikit.schema_matching.topk.magneto.Magneto", + ) + + MAGNETO_FT = ( + "magneto_ft", + "bdikit.schema_matching.topk.magneto.MagnetoFT", + ) + + MAGNETO_GPT = ( + "magneto_gpt", + "bdikit.schema_matching.topk.magneto.MagnetoGPT", + ) + + MAGNETO_FTGPT = ( + "magneto_ftgpt", + "bdikit.schema_matching.topk.magneto.MagnetoFTGPT", + ) def __init__(self, matcher_name: str, matcher_path: str): self.matcher_name = matcher_name diff --git a/bdikit/schema_matching/topk/magneto.py b/bdikit/schema_matching/topk/magneto.py new file mode 100644 index 0000000..8b59284 --- /dev/null +++ b/bdikit/schema_matching/topk/magneto.py @@ -0,0 +1,132 @@ +import pandas as pd +from typing import Dict, Any, List +from magneto import Magneto as Magneto_Lib +from bdikit.schema_matching.one2one.base import BaseSchemaMatcher +from bdikit.download import get_cached_model_or_download +from bdikit.schema_matching.topk.base import ( + ColumnScore, + TopkMatching, + BaseTopkSchemaMatcher, +) + +DEFAULT_MAGNETO_MODEL = "magneto-gdc-v0.1" + + +class MagnetoBase(BaseSchemaMatcher, BaseTopkSchemaMatcher): + def __init__(self, kwargs: Dict[str, Any] = None): + if kwargs is None: + kwargs = {} + self.magneto = Magneto_Lib(**kwargs) + + def map( + self, + source: pd.DataFrame, + target: pd.DataFrame, + ): + # There is an issue in Magneto to get the top-1 match, so get top 2 and then filter + self.magneto.params["topk"] = 2 # Magneto does not provide a method to set topk + raw_matches = self.magneto.get_matches(source, target) + print("raw_matches", raw_matches) + + # Organizing data into the desired structure + sorted_dict = {} + for (source, target), score in raw_matches.items(): + source_column = source[1] + target_column = target[1] + if source_column not in sorted_dict: + sorted_dict[source_column] = [] + sorted_dict[source_column].append((target_column, score)) + + # Sorting the lists by value in descending order and get top 1 + formatted_matches = {} + for key in sorted_dict: + sorted_matches = sorted(sorted_dict[key], key=lambda x: x[1], reverse=True) + formatted_matches[key] = sorted_matches[0][0] + + return formatted_matches + + def get_recommendations( + self, source: pd.DataFrame, target: pd.DataFrame, top_k: int + ) -> List[TopkMatching]: + self.magneto.params["topk"] = ( + top_k # Magneto does not provide a method to set topk + ) + raw_matches = self.magneto.get_matches(source, target) + + # Organizing data into the desired structure + sorted_dict = {} + for (source, target), score in raw_matches.items(): + source_column = source[1] + target_column = target[1] + if source_column not in sorted_dict: + sorted_dict[source_column] = [] + sorted_dict[source_column].append((target_column, score)) + + # Sorting the lists by value in descending order and format top k + top_k_results = [] + for key in sorted_dict: + sorted_matches = sorted(sorted_dict[key], key=lambda x: x[1], reverse=True) + top_k_columns = [ColumnScore(name, score) for name, score in sorted_matches] + top_k_results.append( + { + "source_column": [key] * len(top_k_columns), + "top_k_columns": top_k_columns, + } + ) + + return top_k_results + + +class Magneto(MagnetoBase): + def __init__(self): + super().__init__() + + +class MagnetoFT(MagnetoBase): + def __init__( + self, + encoding_mode: str = "header_values_verbose", + model_name: str = DEFAULT_MAGNETO_MODEL, + model_path: str = None, + ): + embedding_model = check_magneto_model(model_name, model_path) + kwargs = {"encoding_mode": encoding_mode, "embedding_model": embedding_model} + super().__init__(kwargs) + + +class MagnetoGPT(MagnetoBase): + def __init__(self): + kwargs = {"use_bp_reranker": False, "use_gpt_reranker": True} + super().__init__(kwargs) + + +class MagnetoFTGPT(MagnetoBase): + def __init__( + self, + encoding_mode: str = "header_values_verbose", + model_name: str = DEFAULT_MAGNETO_MODEL, + model_path: str = None, + ): + embedding_model = check_magneto_model(model_name, model_path) + kwargs = { + "encoding_mode": encoding_mode, + "embedding_model": embedding_model, + "use_bp_reranker": False, + "use_gpt_reranker": True, + } + super().__init__(kwargs) + + +def check_magneto_model(model_name: str, model_path: str): + if model_name and model_path: + raise ValueError( + "Only one of model_name or model_path should be provided " + "(they are mutually exclusive)" + ) + + if model_path: + return model_path + elif model_name: + return get_cached_model_or_download(model_name) + else: + raise ValueError("Either model_name or model_path must be provided") diff --git a/bdikit/schema_matching/topk/matcher_factory.py b/bdikit/schema_matching/topk/matcher_factory.py index e256787..1d8caf0 100644 --- a/bdikit/schema_matching/topk/matcher_factory.py +++ b/bdikit/schema_matching/topk/matcher_factory.py @@ -10,6 +10,26 @@ class TopkMatchers(Enum): "bdikit.schema_matching.topk.contrastivelearning.CLTopkSchemaMatcher", ) + MAGNETO = ( + "magneto", + "bdikit.schema_matching.topk.magneto.Magneto", + ) + + MAGNETO_FT = ( + "magneto_ft", + "bdikit.schema_matching.topk.magneto.MagnetoFT", + ) + + MAGNETO_GPT = ( + "magneto_gpt", + "bdikit.schema_matching.topk.magneto.MagnetoGPT", + ) + + MAGNETO_FTGPT = ( + "magneto_ftgpt", + "bdikit.schema_matching.topk.magneto.MagnetoFTGPT", + ) + def __init__(self, matcher_name: str, matcher_path: str): self.matcher_name = matcher_name self.matcher_path = matcher_path diff --git a/requirements.txt b/requirements.txt index c41ffba..2408f2d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,5 @@ requests scipy<1.13 matplotlib<3.9 panel!=1.4.3 -nltk>=3.9.1 \ No newline at end of file +nltk>=3.9.1 +magneto-python \ No newline at end of file From 8f052ef7bfcc496f481a7aeb7a7e005c470f449e Mon Sep 17 00:00:00 2001 From: Roque Lopez Date: Wed, 15 Jan 2025 11:47:10 -0500 Subject: [PATCH 2/3] refactor: Add names according to the paper --- bdikit/schema_matching/one2one/matcher_factory.py | 8 ++++---- bdikit/schema_matching/topk/magneto.py | 1 - bdikit/schema_matching/topk/matcher_factory.py | 8 ++++---- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/bdikit/schema_matching/one2one/matcher_factory.py b/bdikit/schema_matching/one2one/matcher_factory.py index df29853..c71f4bc 100644 --- a/bdikit/schema_matching/one2one/matcher_factory.py +++ b/bdikit/schema_matching/one2one/matcher_factory.py @@ -39,22 +39,22 @@ class SchemaMatchers(Enum): "bdikit.schema_matching.one2one.maxvalsim.MaxValSimSchemaMatcher", ) MAGNETO = ( - "magneto", + "magneto_zs_bp", "bdikit.schema_matching.topk.magneto.Magneto", ) MAGNETO_FT = ( - "magneto_ft", + "magneto_ft_bp", "bdikit.schema_matching.topk.magneto.MagnetoFT", ) MAGNETO_GPT = ( - "magneto_gpt", + "magneto_zs_llm", "bdikit.schema_matching.topk.magneto.MagnetoGPT", ) MAGNETO_FTGPT = ( - "magneto_ftgpt", + "magneto_ft_llm", "bdikit.schema_matching.topk.magneto.MagnetoFTGPT", ) diff --git a/bdikit/schema_matching/topk/magneto.py b/bdikit/schema_matching/topk/magneto.py index 8b59284..5f74664 100644 --- a/bdikit/schema_matching/topk/magneto.py +++ b/bdikit/schema_matching/topk/magneto.py @@ -26,7 +26,6 @@ def map( # There is an issue in Magneto to get the top-1 match, so get top 2 and then filter self.magneto.params["topk"] = 2 # Magneto does not provide a method to set topk raw_matches = self.magneto.get_matches(source, target) - print("raw_matches", raw_matches) # Organizing data into the desired structure sorted_dict = {} diff --git a/bdikit/schema_matching/topk/matcher_factory.py b/bdikit/schema_matching/topk/matcher_factory.py index 1d8caf0..707b04c 100644 --- a/bdikit/schema_matching/topk/matcher_factory.py +++ b/bdikit/schema_matching/topk/matcher_factory.py @@ -11,22 +11,22 @@ class TopkMatchers(Enum): ) MAGNETO = ( - "magneto", + "magneto_zs_bp", "bdikit.schema_matching.topk.magneto.Magneto", ) MAGNETO_FT = ( - "magneto_ft", + "magneto_ft_bp", "bdikit.schema_matching.topk.magneto.MagnetoFT", ) MAGNETO_GPT = ( - "magneto_gpt", + "magneto_zs_llm", "bdikit.schema_matching.topk.magneto.MagnetoGPT", ) MAGNETO_FTGPT = ( - "magneto_ftgpt", + "magneto_ft_llm", "bdikit.schema_matching.topk.magneto.MagnetoFTGPT", ) From 56b6441a13c8a8cc5ccc1e1644184efcc79d0e1e Mon Sep 17 00:00:00 2001 From: Roque Lopez Date: Wed, 15 Jan 2025 11:48:01 -0500 Subject: [PATCH 3/3] docs: Add Magneto methods --- docs/source/schema-matching.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/source/schema-matching.rst b/docs/source/schema-matching.rst index 7148252..a673189 100644 --- a/docs/source/schema-matching.rst +++ b/docs/source/schema-matching.rst @@ -15,6 +15,18 @@ To see how to use these methods, please refer to the documentation of :py:func:` * - Method - Class - Description + * - ``magneto_zs_bp`` + - :class:`~bdikit.schema_matching.topk.magneto.Magneto` + - | Uses a zero-shot small language model as retriever with the bipartite algorithm as reranker in Magneto. + * - ``magneto_ft_bp`` + - :class:`~bdikit.schema_matching.topk.magneto.MagnetoFT` + - | Uses a fine-tuned small language model as retriever with the bipartite algorithm as reranker in Magneto. + * - ``magneto_zs_llm`` + - :class:`~bdikit.schema_matching.topk.magneto.MagnetoGPT` + - | Uses a zero-shot small language model as retriever with a large language model as reranker in Magneto. + * - ``magneto_ft_llm`` + - :class:`~bdikit.schema_matching.topk.magneto.MagnetoFTGPT` + - | Uses a fine-tuned small language model as retriever with a large language model as reranker in Magneto. * - ``ct_learning`` - :class:`~bdikit.schema_matching.one2one.contrastivelearning.ContrastiveLearningSchemaMatcher` - | Uses a contrastive (CT) learning model to learn embeddings for columns and retrieves the best match most similar columns using the cosine similarity between the column embeddings.