From 3ba7e22d52320166ec003cbd04c5f09bc0eefe24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Mon, 13 Jan 2025 22:11:18 +0100 Subject: [PATCH 01/49] fix: Added C-MTEB (#1786) Added C-MTEB --- mteb/benchmarks/benchmarks.py | 49 +++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index d5c249e00..3478d48b2 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -1147,3 +1147,52 @@ def load_results( reference="https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6", citation=None, ) + +C_MTEB = Benchmark( + name="MTEB(Chinese)", + tasks=get_tasks( + tasks=[ + "T2Retrieval", + "MMarcoRetrieval", + "DuRetrieval", + "CovidRetrieval", + "CmedqaRetrieval", + "EcomRetrieval", + "MedicalRetrieval", + "VideoRetrieval", + "T2Reranking", + "MMarcoReranking", + "CMedQAv1-reranking", + "CMedQAv2-reranking", + "Ocnli", + "Cmnli", + "CLSClusteringS2S", + "CLSClusteringP2P", + "ThuNewsClusteringS2S", + "ThuNewsClusteringP2P", + "ATEC", + "BQ", + "LCQMC", + "PAWSX", + "STSB", + "AFQMC", + "QBQTC", + "TNews", + "IFlyTek", + "Waimai", + "OnlineShopping", + "MultilingualSentiment", + "JDReview", + ], + ), + description="The Chinese Massive Text Embedding Benchmark (C-MTEB) is a comprehensive benchmark for Chinese text embeddings covering 6 tasks and 35 datasets.", + reference="https://github.com/FlagOpen/FlagEmbedding/tree/master/research/C_MTEB", + citation="""@misc{c-pack, + title={C-Pack: Packaged Resources To Advance General Chinese Embedding}, + author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff}, + year={2023}, + eprint={2309.07597}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +}""", +) From 48370c7b94be22b98816c8410b2f792c1c499169 Mon Sep 17 00:00:00 2001 From: github-actions Date: Mon, 13 Jan 2025 21:27:26 +0000 Subject: [PATCH 02/49] 1.29.1 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 52bb15004..bf7b21eed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.0" +version = "1.29.1" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From e9e9118b9bf6cbda678c70d6776a8f290833eff3 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Tue, 14 Jan 2025 12:53:58 +0900 Subject: [PATCH 03/49] docs: Add contact to MMTEB benchmarks (#1796) * Add myself to MMTEB benchmarks * lint --- mteb/benchmarks/benchmarks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 3478d48b2..0537c604f 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -840,7 +840,7 @@ def load_results( description="The Multilingual benchmarks from MMTEB. 
Currently under development.", reference=None, citation=None, - contacts=["KennethEnevoldsen"], + contacts=["KennethEnevoldsen", "isaac-chung"], ) MTEB_JPN = Benchmark( @@ -952,7 +952,7 @@ def load_results( description="Main Indic benchmark from MMTEB", reference=None, citation=None, - contacts=["KennethEnevoldsen"], + contacts=["KennethEnevoldsen", "isaac-chung"], ) @@ -1084,7 +1084,7 @@ def load_results( description="Main European benchmark from MMTEB", reference=None, citation=None, - contacts=["KennethEnevoldsen"], + contacts=["KennethEnevoldsen", "isaac-chung"], ) LONG_EMBED = Benchmark( From 94103e6a2e8156678c3858045286cbd50b5d49c5 Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Tue, 14 Jan 2025 15:44:54 +0500 Subject: [PATCH 04/49] fix: loading pre 11 (#1798) * fix loading pre 11 * add similarity * lint * run all task types --- mteb/load_results/task_results.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/mteb/load_results/task_results.py b/mteb/load_results/task_results.py index e1b9b9d69..72cae5a93 100644 --- a/mteb/load_results/task_results.py +++ b/mteb/load_results/task_results.py @@ -387,15 +387,16 @@ def _convert_from_before_v1_11_0(cls, data: dict) -> TaskResult: main_score = task.metadata.main_score for split, split_score in scores.items(): for hf_subset, hf_subset_scores in split_score.items(): - if task.metadata.type == "STS": - for name, prev_name in [ - ("cosine", "cos_sim"), - ("manhattan", "manhattan"), - ("euclidean", "euclidean"), - ]: - prev_name_scores = hf_subset_scores.pop( - prev_name, {"spearman": "NaN"} - ) + for name, prev_name in [ + ("cosine", "cos_sim"), + ("manhattan", "manhattan"), + ("euclidean", "euclidean"), + ("dot", "dot"), + ("max", "max"), + ("similarity", "similarity"), + ]: + prev_name_scores = hf_subset_scores.pop(prev_name, None) + if prev_name_scores is not None: for k, v in prev_name_scores.items(): hf_subset_scores[f"{name}_{k}"] = v From b6fb5b8ca7285ec426e952dfbcb1805935f5cf12 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 14 Jan 2025 10:49:54 +0000 Subject: [PATCH 05/49] 1.29.2 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index bf7b21eed..9f1e4deac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.1" +version = "1.29.2" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From a2028840a6b4f77057761664edce8cae2edb64d1 Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Tue, 14 Jan 2025 17:46:26 +0500 Subject: [PATCH 06/49] fix: allow to load no revision available (#1801) * fix allow to load no revision available * lint * add require_model_meta to leaderboard * lint --- mteb/leaderboard/app.py | 4 +- mteb/load_results/load_results.py | 1 + pyproject.toml | 2 +- scripts/compare_leaderboard_results.py | 90 +++++++++++++++----------- 4 files changed, 57 insertions(+), 40 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index d1383cf1a..e3c7d0aad 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -24,7 +24,9 @@ def load_results(): results_cache_path = Path(__file__).parent.joinpath("__cached_results.json") if not results_cache_path.exists(): all_results = ( - mteb.load_results(only_main_score=True).join_revisions().filter_models() + mteb.load_results(only_main_score=True, require_model_meta=False) 
+ .join_revisions() + .filter_models() ) all_results.to_disk(results_cache_path) return all_results diff --git a/mteb/load_results/load_results.py b/mteb/load_results/load_results.py index 03ec6fb30..ef851a1dc 100644 --- a/mteb/load_results/load_results.py +++ b/mteb/load_results/load_results.py @@ -139,6 +139,7 @@ def load_results( continue model_name, revision = model_name_and_revision + model_name = model_name.replace("__", "/") if models_to_keep is not None and model_name not in models_to_keep: continue elif models_to_keep is not None and models_to_keep[model_name] is not None: diff --git a/pyproject.toml b/pyproject.toml index 9f1e4deac..0f96f554d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,7 +57,7 @@ dev = ["ruff==0.6.4", # locked so we don't get PRs which fail only due to a lint codecarbon = ["codecarbon"] speedtask = ["GPUtil>=1.4.0", "psutil>=5.9.8"] peft = ["peft>=0.11.0"] -leaderboard = ["gradio>=5.7.1", "gradio_rangeslider>=0.0.8"] +leaderboard = ["gradio>=5.7.1", "gradio_rangeslider>=0.0.8", "plotly>=5.24.0"] flagembedding = ["FlagEmbedding"] jina = ["einops>=0.8.0"] flash_attention = ["flash-attn>=2.6.3"] diff --git a/scripts/compare_leaderboard_results.py b/scripts/compare_leaderboard_results.py index bbeb912bb..1fe9c3d76 100644 --- a/scripts/compare_leaderboard_results.py +++ b/scripts/compare_leaderboard_results.py @@ -2,70 +2,84 @@ import json import logging +from collections import defaultdict from pathlib import Path -from mteb import MTEB_ENG_CLASSIC, load_results +from mteb import get_benchmark, load_results logging.basicConfig(level=logging.INFO) models = [ - "dunzhang/stella_en_1.5B_v5", - "dunzhang/stella_en_400M_v5", + "intfloat/multilingual-e5-small", # Add other models here ] +benchmark = get_benchmark("MTEB(Chinese)") + +results = [] # in same folder as mteb repo # git clone https://github.com/embeddings-benchmark/leaderboard -data_tasks_path = Path("../../leaderboard/boards_data/en/data_tasks/") +# get path of current file +base_path = Path(__file__).parent.parent.parent / "leaderboard" / "boards_data" -results = [] for model_name_to_search in models: model_results = load_results( models=[model_name_to_search], - tasks=MTEB_ENG_CLASSIC.tasks, + tasks=benchmark.tasks, only_main_score=True, + require_model_meta=False, ) - cur_model = {} + cur_model = {task.metadata.name: defaultdict(dict) for task in benchmark.tasks} for model_res in model_results: for task_res in model_res.task_results: task_name = task_res.task.metadata.name - split = "test" if task_name != "MSMARCO" else "dev" - scores = [score["main_score"] for score in task_res.scores[split]] - # this tmp solution, because some tasks have multiple results - cur_model[task_name] = {"new": round((sum(scores) / len(scores)) * 100, 2)} - for task_dir in data_tasks_path.iterdir(): - if task_dir.is_dir(): - results_file_path = task_dir / "default.jsonl" - if results_file_path.exists(): - with open(results_file_path) as file: - for line in file: - data = json.loads(line) - model_name = data.get("Model", "") - if model_name_to_search in model_name: - for key, value in data.items(): - if key in [ - "index", - "Rank", - "Model", - "Model Size (Million Parameters)", - "Memory Usage (GB, fp32)", - "Embedding Dimensions", - "Max Tokens", - "Average", - ]: - continue - for benchmark_task in MTEB_ENG_CLASSIC.tasks: - if benchmark_task.metadata.name in key: - cur_model[benchmark_task.metadata.name][ - "old" - ] = value + split = ( + "test" + if "test" in task_res.task.metadata.eval_splits + else 
task_res.task.metadata.eval_splits[0] + ) + if split in task_res.scores: + scores = [score["main_score"] for score in task_res.scores[split]] + cur_model[task_name]["new"] = round( + (sum(scores) / len(scores)) * 100, 2 + ) + + for lang_path in base_path.iterdir(): + data_tasks_path = lang_path / "data_tasks" + + for task_dir in data_tasks_path.iterdir(): + if task_dir.is_dir(): + results_file_path = task_dir / "default.jsonl" + if results_file_path.exists(): + with open(results_file_path) as file: + for line in file: + data = json.loads(line) + model_name = data.get("Model", "") + if model_name_to_search in model_name: + for key, value in data.items(): + if key in [ + "index", + "Rank", + "Model", + "Model Size (Million Parameters)", + "Memory Usage (GB, fp32)", + "Embedding Dimensions", + "Max Tokens", + "Average", + ]: + continue + for benchmark_task in benchmark.tasks: + if benchmark_task.metadata.name in key: + cur_model[benchmark_task.metadata.name][ + "old" + ] = value sorted_cur_model = { task.metadata.name: cur_model[task.metadata.name] - for task in MTEB_ENG_CLASSIC.tasks + for task in benchmark.tasks if task.metadata.name in cur_model } results.append({"model": model_name_to_search, "results": sorted_cur_model}) From bcb2cd97c8afb80e11d636ea34689bd08f922b19 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 14 Jan 2025 13:03:43 +0000 Subject: [PATCH 07/49] 1.29.3 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0f96f554d..441332dd7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.2" +version = "1.29.3" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 0acc166d54294ce16dc4750a84ad4abd896ab92d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Wed, 15 Jan 2025 10:25:21 +0100 Subject: [PATCH 08/49] fix: Zero shot and aggregation on Leaderboard (#1810) * Made join_revision filter out no_revision_available when other revisions have been run on the task * Fixed zero-shot filtering * Fixed aggregation of task types * Ran linting --- mteb/leaderboard/app.py | 14 +++++++------- mteb/leaderboard/table.py | 2 +- mteb/load_results/benchmark_results.py | 12 ++++++++++-- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index e3c7d0aad..cb806e467 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -412,7 +412,7 @@ def filter_models( compatibility, instructions, model_size, - zero_shot, + zero_shot_setting, ): lower, upper = model_size # Setting to None, when the user doesn't specify anything @@ -432,12 +432,12 @@ def filter_models( tasks = mteb.get_tasks(tasks=task_select) models_to_keep = set() for model_meta in model_metas: - is_zero_shot = model_meta.is_zero_shot_on(tasks) - if is_zero_shot is None: - if zero_shot == "hard": + is_model_zero_shot = model_meta.is_zero_shot_on(tasks) + if is_model_zero_shot is None: + if zero_shot_setting == "hard": continue - if not zero_shot: - if zero_shot != "off": + elif not is_model_zero_shot: + if zero_shot_setting != "off": continue models_to_keep.add(model_meta.name) return list(models_to_keep) @@ -460,7 +460,7 @@ def update_models( compatibility, instructions, model_size, - zero_shot, + zero_shot_setting=zero_shot, ) elapsed = time.time() - start_time logger.info(f"update_models callback: 
{elapsed}s") diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index 9a1dc5799..041df4709 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -88,7 +88,7 @@ def get_means_per_types(per_task: pd.DataFrame): dict( model_name=model_name, task_type=task_type, - score=scores[tasks].mean(), + score=scores[tasks].mean(skipna=False), ) ) return pd.DataFrame.from_records(records) diff --git a/mteb/load_results/benchmark_results.py b/mteb/load_results/benchmark_results.py index 015a96d33..e1632a3de 100644 --- a/mteb/load_results/benchmark_results.py +++ b/mteb/load_results/benchmark_results.py @@ -260,8 +260,16 @@ def parse_version(version_str: str) -> Version | None: def keep_best(group: pd.DataFrame) -> pd.DataFrame: is_main_revision = group["revision"] == group["main_revision"] - if is_main_revision.sum() == 1: - return group[is_main_revision] + # If the main revision is present we select that + if is_main_revision.sum() > 0: + return group[is_main_revision].head(n=1) + unique_revisions = group["revision"].unique() + # Filtering out no_revision_available if other revisions are present + if (len(unique_revisions) > 1) and ( + "no_revision_available" in unique_revisions + ): + group = group[group["revision"] != "no_revision_available"] + # If there are any not-NA mteb versions, we select the latest one if group["mteb_version"].notna().any(): group = group.dropna(subset=["mteb_version"]) group = group.sort_values("mteb_version", ascending=False) From 3f5ee82a5049eaf235a84fcfc9278f48adecfcb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Wed, 15 Jan 2025 10:43:16 +0100 Subject: [PATCH 09/49] fix: Added `ModelMeta` for BGE, GTE Chinese and multilingual models (#1811) * Added BGE Chinese and multilingual-gemma models * Added GTE multilingual and Chinese models * Fixed date format --- mteb/models/bge_models.py | 195 ++++++++++++++++++++++++++++++++++++++ mteb/models/gte_models.py | 195 +++++++++++++++++++++++++++++++++++++- 2 files changed, 389 insertions(+), 1 deletion(-) diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index 276d28526..23851b498 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -5,6 +5,89 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader model_prompts = {"query": "Represent this sentence for searching relevant passages: "} +model_prompts_zh = {"query": "为这个句子生成表示以用于检索相关文章:"} + +bge_m_training_data = { + # source: https://arxiv.org/pdf/2402.03216 + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + "LeCaRDv2": ["train"], + "CMedQAv1-reranking": ["train"], + "CMedQAv2-reranking": ["train"], + "MrTidyRetrieval": ["train"], + "T2Reranking": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + # + synthetic data +} + +bge_training_data = { + # source: https://data.baai.ac.cn/details/BAAI-MTP + "NQ": ["test"], + "NQHardNegatives": ["test"], + "AmazonReviewsClassification": [ + "validation", + "test", + ], # assumed from: amazon_reviews_multi + "MLQARetrieval": [ + "validation", + "test", + ], # assumed from mlqa (question, context) + # not in mteb + 
# Dataset Pairs + # wudao (title, passage) + # cmrc2018 (query, context) + # dureader (query, context) + # simclue (sentence_a, sentence_b) + # csl (title, abstract) + # amazon_reviews_multi (title, body) + # wiki_atomic_edits (base_sentence, edited_sentence) + # mlqa (question, context) + # xlsum (title, summary) (title, text) + # "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further + # "wikipedia": [], # title + section title, passage + # "reddit": [], # title, body + # "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer) + # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) +} + +bge_chinese_training_data = { + # source: https://arxiv.org/pdf/2309.07597 + "T2Retrieval": ["train"], + "DuReader": ["train"], + "MMarcoReranking": ["train"], + "CMedQAv2-reranking": ["train"], + "Cmnli": ["train"], + "Ocnli": ["train"], + # not in mteb + # - multi-cpr + # - NLI-zh + # Dataset Pairs + # wudao (title, passage) + # cmrc2018 (query, context) + # dureader (query, context) + # simclue (sentence_a, sentence_b) + # csl (title, abstract) + # amazon_reviews_multi (title, body) + # wiki_atomic_edits (base_sentence, edited_sentence) + # mlqa (question, context) + # xlsum (title, summary) (title, text) + # "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further + # "wikipedia": [], # title + section title, passage + # "reddit": [], # title, body + # "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer) + # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) +} bge_small_en_v1_5 = ModelMeta( loader=partial( # type: ignore @@ -167,3 +250,115 @@ # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) }, ) + +bge_small_zh_v1_5 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="BAAI/bge-small-zh-v1.5", + revision="7999e1d3359715c523056ef9478215996d62a620", + model_prompts=model_prompts_zh, + ), + name="BAAI/bge-small-zh-v1.5", + languages=["zho_Hans"], + open_weights=True, + revision="7999e1d3359715c523056ef9478215996d62a620", + release_date="2023-09-12", # initial commit of hf model. + n_parameters=24_000_000, + memory_usage=None, + embed_dim=512, + license="mit", + max_tokens=512, + reference="https://huggingface.co/BAAI/bge-small-zh-v1.5", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP + public_training_code=None, # seemingly released (at least for some models, but the link is broken + training_datasets=bge_chinese_training_data, +) + +bge_base_zh_v1_5 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="BAAI/bge-base-zh-v1.5", + revision="f03589ceff5aac7111bd60cfc7d497ca17ecac65", + model_prompts=model_prompts_zh, + ), + name="BAAI/bge-base-zh-v1.5", + languages=["zho_Hans"], + open_weights=True, + revision="f03589ceff5aac7111bd60cfc7d497ca17ecac65", + release_date="2023-09-11", # initial commit of hf model. 
+ n_parameters=438_000_000, + memory_usage=None, + embed_dim=768, + license="mit", + max_tokens=512, + reference="https://huggingface.co/BAAI/bge-base-zh-v1.5", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP + public_training_code=None, # seemingly released (at least for some models, but the link is broken + training_datasets=bge_chinese_training_data, +) + +bge_large_zh_v1_5 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="BAAI/bge-large-zh-v1.5", + revision="79e7739b6ab944e86d6171e44d24c997fc1e0116", + model_prompts=model_prompts_zh, + ), + name="BAAI/bge-large-zh-v1.5", + languages=["zho_Hans"], + open_weights=True, + revision="79e7739b6ab944e86d6171e44d24c997fc1e0116", + release_date="2023-09-12", # initial commit of hf model. + n_parameters=1_340_000_000, + memory_usage=None, + embed_dim=1024, + license="mit", + max_tokens=512, + reference="https://huggingface.co/BAAI/bge-large-zh-v1.5", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP + public_training_code=None, # seemingly released (at least for some models, but the link is broken + training_datasets=bge_chinese_training_data, +) + +bge_multilingual_gemma2 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="BAAI/bge-multilingual-gemma2", + revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a", + ), + name="BAAI/bge-multilingual-gemma2", + languages=[ + "eng_Latn", + "zho_Hans", + "kor_Hang", + "kor_Latn", + "fra_Latn", + "jpn_Jpan", + "jpn_Latn", + ], # This list is incomlete. Their description says "and more". + # I'm also unsure about the scripts. + open_weights=True, + revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a", + release_date="2024-07-25", # initial commit of hf model. + n_parameters=9.24 * 1e9, + memory_usage=None, + embed_dim=3584, # from old C-MTEB leaderboard + license="gemma", + max_tokens=8192, # from old C-MTEB leaderboard + reference="https://huggingface.co/BAAI/bge-multilingual-gemma2", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_data=False, + public_training_code=False, + training_datasets=None, # not disclosed +) diff --git a/mteb/models/gte_models.py b/mteb/models/gte_models.py index 648fc1885..f800aaa94 100644 --- a/mteb/models/gte_models.py +++ b/mteb/models/gte_models.py @@ -5,7 +5,7 @@ import torch from mteb.encoder_interface import PromptType -from mteb.model_meta import ModelMeta +from mteb.model_meta import ModelMeta, sentence_transformers_loader from mteb.models.instruct_wrapper import instruct_wrapper @@ -105,3 +105,196 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, ) + +gte_small_zh = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="thenlper/gte-small-zh", + revision="af7bd46fbb00b3a6963c8dd7f1786ddfbfbe973a", + ), + name="thenlper/gte-small-zh", + languages=["zho_Hans"], + open_weights=True, + revision="af7bd46fbb00b3a6963c8dd7f1786ddfbfbe973a", + release_date="2023-11-08", # initial commit of hf model. 
+ n_parameters=30.3 * 1e6, + memory_usage=None, + embed_dim=1024, + license="mit", + max_tokens=512, + reference="https://huggingface.co/thenlper/gte-small-zh", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_data=False, + public_training_code=None, + training_datasets=None, # Not disclosed +) + +gte_base_zh = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="thenlper/gte-base-zh", + revision="71ab7947d6fac5b64aa299e6e40e6c2b2e85976c", + ), + name="thenlper/gte-base-zh", + languages=["zho_Hans"], + open_weights=True, + revision="71ab7947d6fac5b64aa299e6e40e6c2b2e85976c", + release_date="2023-11-08", # initial commit of hf model. + n_parameters=102 * 1e6, + memory_usage=None, + embed_dim=1024, + license="mit", + max_tokens=512, + reference="https://huggingface.co/thenlper/gte-base-zh", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_data=False, + public_training_code=None, + training_datasets=None, # Not disclosed +) + +gte_large_zh = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="thenlper/gte-large-zh", + revision="64c364e579de308104a9b2c170ca009502f4f545", + ), + name="thenlper/gte-large-zh", + languages=["zho_Hans"], + open_weights=True, + revision="64c364e579de308104a9b2c170ca009502f4f545", + release_date="2023-11-08", # initial commit of hf model. + n_parameters=326 * 1e6, + memory_usage=None, + embed_dim=1024, + license="mit", + max_tokens=512, + reference="https://huggingface.co/thenlper/gte-large-zh", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_data=False, + public_training_code=None, + training_datasets=None, # Not disclosed +) + +gte_multilingual_langs = [ + "afr_Latn", + "ara_Arab", + "aze_Latn", + "bel_Cyrl", + "bul_Cyrl", + "ben_Beng", + "cat_Latn", + "ceb_Latn", + "ces_Latn", + "cym_Latn", + "dan_Latn", + "deu_Latn", + "ell_Grek", + "eng_Latn", + "spa_Latn", + "est_Latn", + "eus_Latn", + "fas_Arab", + "fin_Latn", + "fra_Latn", + "glg_Latn", + "guj_Gujr", + "heb_Hebr", + "hin_Deva", + "hrv_Latn", + "hat_Latn", + "hun_Latn", + "hye_Armn", + "ind_Latn", + "isl_Latn", + "ita_Latn", + "jpn_Jpan", + "jav_Latn", + "kat_Geor", + "kaz_Cyrl", + "khm_Khmr", + "kan_Knda", + "kor_Hang", + "kir_Cyrl", + "lao_Laoo", + "lit_Latn", + "lav_Latn", + "mkd_Cyrl", + "mal_Mlym", + "mon_Cyrl", + "mar_Deva", + "msa_Latn", + "mya_Mymr", + "nep_Deva", + "nld_Latn", + "nor_Latn", + "pan_Guru", + "pol_Latn", + "por_Latn", + "que_Latn", + "ron_Latn", + "rus_Cyrl", + "sin_Sinh", + "slk_Latn", + "slv_Latn", + "swa_Latn", + "tam_Taml", + "tel_Telu", + "tha_Thai", + "tgl_Latn", + "tur_Latn", + "ukr_Cyrl", + "urd_Arab", + "vie_Latn", + "yor_Latn", + "zho_Hans", +] +# Source: https://arxiv.org/pdf/2407.19669 +gte_multi_training_data = { + "T2Retrieval": ["train"], + "DuReader": ["train"], + "MMarcoReranking": ["train"], + "CMedQAv2-reranking": ["train"], + "NQ": ["train"], + "MSMARCO": ["train"], + "HotpotQA": ["train"], + "FEVER": ["train"], + "MIRACLReranking": ["train"], + "MrTidyRetrieval": ["train"], + "MultiLongDocRetrieval": ["train"], + # not in MTEB: + # - TriviaQA + # - SQuAD + # - AllNLI + # - Multi-CPR +} + +gte_multilingual_base = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="Alibaba-NLP/gte-multilingual-base", + 
revision="ca1791e0bcc104f6db161f27de1340241b13c5a4", + ), + name="Alibaba-NLP/gte-multilingual-base", + languages=gte_multilingual_langs, + open_weights=True, + revision="ca1791e0bcc104f6db161f27de1340241b13c5a4", + release_date="2024-07-20", # initial commit of hf model. + n_parameters=305 * 1e6, + memory_usage=None, + embed_dim=1024, + license="apache-2", + max_tokens=8192, + reference="https://huggingface.co/Alibaba-NLP/gte-multilingual-base", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_data=True, + public_training_code=None, # couldn't find + training_datasets=gte_multi_training_data, +) From 217dabe0c947778a656cffb578acd736148f1aa1 Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 15 Jan 2025 09:58:54 +0000 Subject: [PATCH 10/49] 1.29.4 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 441332dd7..153a22088 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.3" +version = "1.29.4" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From c4ee9fe1ccffaea57b8bf21d42e4031386a95c01 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Wed, 15 Jan 2025 15:06:13 +0100 Subject: [PATCH 11/49] fix: Add additional contacts (#1817) add contacts from #1790 --- mteb/benchmarks/benchmarks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 0537c604f..7d8aedc9d 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -592,6 +592,7 @@ def load_results( primaryClass={cs.CL}, url={https://arxiv.org/abs/2401.02709}, }""", + contacts=["slvnwhrl"], ) @@ -657,6 +658,7 @@ def load_results( journal={arXiv preprint arXiv:2405.10138}, year={2024} }""", + contacts=["rafalposwiata"], ) MTEB_code = Benchmark( From e3a3df89b5749924bb45986460d25ec58f7f24e8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 15 Jan 2025 14:09:00 +0000 Subject: [PATCH 12/49] Update points table --- docs/mmteb/points_table.md | 200 ++++++++++++++++++------------------- 1 file changed, 100 insertions(+), 100 deletions(-) diff --git a/docs/mmteb/points_table.md b/docs/mmteb/points_table.md index 37f7c7725..cd166890d 100644 --- a/docs/mmteb/points_table.md +++ b/docs/mmteb/points_table.md @@ -2,103 +2,103 @@ _Note_: this table is **autogenerated** and should not be edited. It is intended to get an overview of contributions. 
- | GitHub | Paper writing | New dataset | Review PR | Bug fixes | Coordination | Dataset annotations | New task | Running Models | Total | -|:------------------|----------------:|--------------:|------------:|------------:|---------------:|----------------------:|-----------:|-----------------:|--------:| -| KennethEnevoldsen | 0 | 68 | 326 | 87 | 81 | 35 | 0 | 0 | 597 | -| isaac-chung | 12 | 120 | 194 | 50 | 54 | 1 | 2 | 0 | 433 | -| imenelydiaker | 0 | 120 | 144 | 24 | 70 | 0 | 0 | 0 | 358 | -| awinml | 0 | 300 | 2 | 0 | 0 | 0 | 0 | 0 | 302 | -| x-tabdeveloping | 0 | 144 | 32 | 10 | 41 | 0 | 12 | 0 | 239 | -| davidstap | 0 | 176 | 0 | 0 | 0 | 0 | 0 | 0 | 176 | -| jaygala24 | 0 | 149 | 0 | 0 | 0 | 0 | 0 | 0 | 149 | -| wissam-sib | 0 | 134 | 6 | 4 | 0 | 0 | 0 | 0 | 144 | -| Muennighoff | 0 | 0 | 48 | 0 | 70 | 0 | 0 | 24 | 142 | -| orionw | 0 | 0 | 20 | 20 | 75 | 0 | 10 | 0 | 125 | -| dokato | 0 | 94 | 6 | 12 | 0 | 0 | 0 | 0 | 112 | -| gentaiscool | 0 | 110 | 0 | 0 | 0 | 0 | 0 | 0 | 110 | -| jupyterjazz | 0 | 108 | 0 | 0 | 0 | 0 | 0 | 0 | 108 | -| SaitejaUtpala | 0 | 102 | 0 | 0 | 0 | 0 | 0 | 0 | 102 | -| vaibhavad | 0 | 6 | 4 | 8 | 75 | 0 | 0 | 0 | 93 | -| schmarion | 0 | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | -| MathieuCiancone | 0 | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | -| GabrielSequeira | 0 | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | -| digantamisra98 | 0 | 71 | 0 | 0 | 0 | 0 | 0 | 0 | 71 | -| shreeya-dhakal | 0 | 54 | 8 | 0 | 0 | 0 | 0 | 0 | 62 | -| Rysias | 0 | 58 | 0 | 0 | 0 | 0 | 0 | 0 | 58 | -| Samoed | 0 | 18 | 2 | 22 | 0 | 0 | 0 | 9 | 51 | -| sivareddyg | 0 | 0 | 0 | 0 | 50 | 0 | 0 | 0 | 50 | -| gowitheflow-1998 | 0 | 50 | 0 | 0 | 0 | 0 | 0 | 0 | 50 | -| asparius | 0 | 34 | 14 | 0 | 0 | 0 | 0 | 0 | 48 | -| Akash190104 | 0 | 46 | 0 | 0 | 0 | 0 | 0 | 0 | 46 | -| MartinBernstorff | 0 | 2 | 8 | 13 | 20 | 0 | 0 | 0 | 43 | -| akshita-sukhlecha | 0 | 36 | 0 | 4 | 0 | 0 | 0 | 0 | 40 | -| staoxiao | 0 | 40 | 0 | 0 | 0 | 0 | 0 | 0 | 40 | -| bp-high | 0 | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 36 | -| rafalposwiata | 0 | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 36 | -| KranthiGV | 0 | 20 | 14 | 0 | 0 | 0 | 0 | 0 | 34 | -| loicmagne | 0 | 0 | 0 | 28 | 0 | 0 | 0 | 0 | 28 | -| ShawonAshraf | 0 | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| bjoernpl | 0 | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| jphme | 0 | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| rasdani | 0 | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| violenil | 0 | 26 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | -| mariyahendriksen | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 24 | -| dwzhu-pku | 0 | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 24 | -| hgissbkh | 3 | 0 | 2 | 13 | 0 | 0 | 5 | 0 | 23 | -| taeminlee | 0 | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | -| kwojtasi | 0 | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | -| jankounchained | 0 | 14 | 0 | 8 | 0 | 0 | 0 | 0 | 22 | -| tomaarsen | 0 | 0 | 2 | 0 | 20 | 0 | 0 | 0 | 22 | -| crystina-z | 0 | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | -| mrshu | 0 | 16 | 4 | 0 | 0 | 1 | 0 | 0 | 21 | -| john-b-yang | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | -| rbroc | 0 | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | -| mmhamdy | 0 | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | -| ManuelFay | 0 | 2 | 0 | 13 | 0 | 0 | 5 | 0 | 20 | -| AlexeyVatolin | 0 | 0 | 0 | 20 | 0 | 0 | 0 | 0 | 20 | -| Andrian0s | 0 | 14 | 4 | 2 | 0 | 0 | 0 | 0 | 20 | -| thakur-nandan | 0 | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | -| manandey | 0 | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | -| PranjalChitale | 0 | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | -| dipam7 | 0 | 14 | 2 | 0 | 0 | 0 | 0 | 0 | 16 | -| sted97 | 0 | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | -| Sakshamrzt | 0 | 12 | 4 | 0 | 0 | 0 | 0 | 0 | 16 | 
-| taidnguyen | 0 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | -| artemsnegirev | 0 | 12 | 0 | 0 | 0 | 2 | 0 | 0 | 14 | -| slvnwhrl | 0 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| anpalmak2003 | 0 | 9 | 0 | 0 | 0 | 3 | 0 | 0 | 12 | -| Art3mis07 | 0 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| guenthermi | 0 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| jordiclive | 0 | 2 | 0 | 10 | 0 | 0 | 0 | 0 | 12 | -| xhluca | 0 | 6 | 2 | 4 | 0 | 0 | 0 | 0 | 12 | -| henilp105 | 0 | 0 | 0 | 2 | 0 | 9 | 0 | 0 | 11 | -| MariyaTikhonova | 0 | 7 | 0 | 0 | 0 | 4 | 0 | 0 | 11 | -| ab1992ao | 0 | 8 | 0 | 0 | 0 | 3 | 0 | 0 | 11 | -| tmp_handle | 0 | 0 | 0 | 0 | 10 | 0 | 0 | 0 | 10 | -| swj0419 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| Ruqyai | 0 | 2 | 8 | 0 | 0 | 0 | 0 | 0 | 10 | -| ZhengLiu101 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| Alenush | 0 | 6 | 0 | 0 | 0 | 4 | 0 | 0 | 10 | -| ABorghini | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| simon-clematide | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| sarahooker | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| guangyusong | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| HLasse | 0 | 0 | 0 | 5 | 0 | 5 | 0 | 0 | 10 | -| cassanof | 0 | 8 | 0 | 1 | 0 | 0 | 0 | 1 | 10 | -| hongjin-su | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| xiamengzhou | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| xu3kev | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| howard-yen | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| malteos | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| ljvmiranda921 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| marcobellagente93 | 0 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| izhx | 0 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| MexicanLemonade | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| antoniolanza1996 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 2 | -| achibb | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| NouamaneTazi | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2 | -| PhilipMay | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2 | -| cslizc | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| bakrianoo | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| hanhainebula | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| monikernemo | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | \ No newline at end of file + | GitHub | New dataset | Review PR | Bug fixes | Coordination | Paper writing | Dataset annotations | Running Models | New task | Total | +|:------------------|--------------:|------------:|------------:|---------------:|----------------:|----------------------:|-----------------:|-----------:|--------:| +| KennethEnevoldsen | 68 | 326 | 87 | 81 | 0 | 35 | 0 | 0 | 597 | +| isaac-chung | 120 | 194 | 50 | 54 | 12 | 1 | 0 | 2 | 433 | +| imenelydiaker | 120 | 144 | 24 | 70 | 0 | 0 | 0 | 0 | 358 | +| awinml | 300 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 302 | +| x-tabdeveloping | 144 | 32 | 10 | 41 | 0 | 0 | 0 | 12 | 239 | +| davidstap | 176 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 176 | +| jaygala24 | 149 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 149 | +| wissam-sib | 134 | 6 | 4 | 0 | 0 | 0 | 0 | 0 | 144 | +| Muennighoff | 0 | 48 | 0 | 70 | 0 | 0 | 24 | 0 | 142 | +| orionw | 0 | 20 | 20 | 75 | 0 | 0 | 0 | 10 | 125 | +| dokato | 94 | 6 | 12 | 0 | 0 | 0 | 0 | 0 | 112 | +| gentaiscool | 110 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 110 | +| jupyterjazz | 108 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 108 | +| SaitejaUtpala | 102 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 102 | +| vaibhavad | 6 | 4 | 8 | 75 | 0 | 0 | 0 | 0 | 93 | +| schmarion | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | +| MathieuCiancone | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | +| GabrielSequeira | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | +| digantamisra98 | 71 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 71 | +| shreeya-dhakal | 54 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 62 
| +| Rysias | 58 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 58 | +| Samoed | 18 | 2 | 22 | 0 | 0 | 0 | 9 | 0 | 51 | +| sivareddyg | 0 | 0 | 0 | 50 | 0 | 0 | 0 | 0 | 50 | +| gowitheflow-1998 | 50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 50 | +| asparius | 34 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 48 | +| Akash190104 | 46 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 46 | +| MartinBernstorff | 2 | 8 | 13 | 20 | 0 | 0 | 0 | 0 | 43 | +| akshita-sukhlecha | 36 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 40 | +| staoxiao | 40 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 40 | +| bp-high | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 36 | +| rafalposwiata | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 36 | +| KranthiGV | 20 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 34 | +| loicmagne | 0 | 0 | 28 | 0 | 0 | 0 | 0 | 0 | 28 | +| ShawonAshraf | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | +| bjoernpl | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | +| jphme | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | +| rasdani | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | +| violenil | 26 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | +| mariyahendriksen | 0 | 0 | 0 | 0 | 24 | 0 | 0 | 0 | 24 | +| dwzhu-pku | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 24 | +| hgissbkh | 0 | 2 | 13 | 0 | 3 | 0 | 0 | 5 | 23 | +| taeminlee | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | +| kwojtasi | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | +| jankounchained | 14 | 0 | 8 | 0 | 0 | 0 | 0 | 0 | 22 | +| tomaarsen | 0 | 2 | 0 | 20 | 0 | 0 | 0 | 0 | 22 | +| crystina-z | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | +| mrshu | 16 | 4 | 0 | 0 | 0 | 1 | 0 | 0 | 21 | +| john-b-yang | 0 | 0 | 0 | 0 | 20 | 0 | 0 | 0 | 20 | +| rbroc | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | +| mmhamdy | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | +| ManuelFay | 2 | 0 | 13 | 0 | 0 | 0 | 0 | 5 | 20 | +| AlexeyVatolin | 0 | 0 | 20 | 0 | 0 | 0 | 0 | 0 | 20 | +| Andrian0s | 14 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 20 | +| thakur-nandan | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | +| manandey | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | +| PranjalChitale | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| dipam7 | 14 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| sted97 | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| Sakshamrzt | 12 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| taidnguyen | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | +| artemsnegirev | 12 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 14 | +| slvnwhrl | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| anpalmak2003 | 9 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 12 | +| Art3mis07 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| guenthermi | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| jordiclive | 2 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 12 | +| xhluca | 6 | 2 | 4 | 0 | 0 | 0 | 0 | 0 | 12 | +| henilp105 | 0 | 0 | 2 | 0 | 0 | 9 | 0 | 0 | 11 | +| MariyaTikhonova | 7 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 11 | +| ab1992ao | 8 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 11 | +| tmp_handle | 0 | 0 | 0 | 10 | 0 | 0 | 0 | 0 | 10 | +| swj0419 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| Ruqyai | 2 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| ZhengLiu101 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| Alenush | 6 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 10 | +| ABorghini | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| simon-clematide | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| sarahooker | 0 | 0 | 0 | 0 | 10 | 0 | 0 | 0 | 10 | +| guangyusong | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| HLasse | 0 | 0 | 5 | 0 | 0 | 5 | 0 | 0 | 10 | +| cassanof | 8 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 10 | +| hongjin-su | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| xiamengzhou | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| xu3kev | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| howard-yen | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| malteos | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| ljvmiranda921 | 10 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| marcobellagente93 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| izhx | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| MexicanLemonade | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| antoniolanza1996 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2 | +| achibb | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| NouamaneTazi | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| PhilipMay | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| cslizc | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| bakrianoo | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| hanhainebula | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| monikernemo | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | \ No newline at end of file From 186cc23fc8fa481e1113ef86dc8a69e4504b0ac8 Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 15 Jan 2025 14:13:52 +0000 Subject: [PATCH 13/49] 1.29.5 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 153a22088..9a5459793 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.4" +version = "1.29.5" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 748955c367b5c549f4b8d54945361f5bbc7184f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Wed, 15 Jan 2025 16:26:22 +0100 Subject: [PATCH 14/49] fix: Added more Chinese models' `ModelMeta` (#1814) * Added Multilingual USE models * Added Moka models * Added dmeta models * Added jina-zh * Added piccolo models --- mteb/models/misc_models.py | 47 ++++++ mteb/models/moka_models.py | 150 ++++++++++++++++++++ mteb/models/overview.py | 1 + mteb/models/piccolo_models.py | 50 +++++++ mteb/models/sentence_transformers_models.py | 107 +++++++++++++- 5 files changed, 354 insertions(+), 1 deletion(-) create mode 100644 mteb/models/moka_models.py create mode 100644 mteb/models/piccolo_models.py diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index d05461af1..5e8fcae0a 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -1661,3 +1661,50 @@ adapted_from="intfloat/e5-mistral-7b-instruct", superseded_by=None, ) +sbert_chinese_general_v1 = ModelMeta( + name="DMetaSoul/sbert-chinese-general-v1", + revision="bd27765956bcc2fcf682de0097819947ac10037e", + release_date="2022-03-25", + languages=["zho_Hans"], + loader=None, + n_parameters=None, # Not visible on repo + memory_usage=None, + max_tokens=512, + embed_dim=128, + license="apache-2", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/DMetaSoul/sbert-chinese-general-v1", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets={ + "PAWSX": ["train"], + "PawsXPairClassification": ["train"], # they do not specify which one + # They might have trained on other datasets too, they don't say: + # "trained on semantically similar datasets such as NLI, PAWS-X, PKU-Paraphrase-Bank, and STS." 
+ }, + superseded_by=None, +) +dmeta_embedding_zh_small = ModelMeta( + name="DMetaSoul/Dmeta-embedding-zh-small", + revision="2050d3439a2f68999dd648c1697471acaac37a29", + release_date="2024-03-25", + languages=["zho_Hans"], + loader=None, + n_parameters=74.2 * 1e6, + memory_usage=None, + max_tokens=1024, + embed_dim=768, + license="apache-2", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/DMetaSoul/Dmeta-embedding-zh-small/", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, # They don't specify + superseded_by=None, +) diff --git a/mteb/models/moka_models.py b/mteb/models/moka_models.py new file mode 100644 index 000000000..cf9b96f88 --- /dev/null +++ b/mteb/models/moka_models.py @@ -0,0 +1,150 @@ +"""Moka AI's Chinese embedding models""" + +from __future__ import annotations + +from mteb.model_meta import ModelMeta + +sent_trf_training_dataset = { + # derived from datasheets + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB + # "s2orc": ["train"], + # "flax-sentence-embeddings/stackexchange_xml": ["train"], + # "ms_marco": ["train"], + # "gooaq": ["train"], + # "yahoo_answers_topics": ["train"], + # "code_search_net": ["train"], + # "search_qa": ["train"], + # "eli5": ["train"], + # "snli": ["train"], + # "multi_nli": ["train"], + # "wikihow": ["train"], + # "natural_questions": ["train"], + # "trivia_qa": ["train"], + # "embedding-data/sentence-compression": ["train"], + # "embedding-data/flickr30k-captions": ["train"], + # "embedding-data/altlex": ["train"], + # "embedding-data/simple-wiki": ["train"], + # "embedding-data/QQP": ["train"], + # "embedding-data/SPECTER": ["train"], + # "embedding-data/PAQ_pairs": ["train"], + # "embedding-data/WikiAnswers": ["train"], +} +medi_dataset = { + **sent_trf_training_dataset, + # not in MTEB: + # - Super-NI + # - KILT (https://arxiv.org/abs/2009.02252) + # - MedMCQA (https://proceedings.mlr.press/v174/pal22a/pal22a.pdf) +} +m3e_dataset = { + **medi_dataset, + "AmazonReviewsClassification": ["train"], # Possibly also test, hard to know + "Ocnli": ["train"], + "BQ": ["train"], + "LCQMC": ["train"], + "MIRACLReranking": ["train"], + "PAWSX": ["train"], + # not in MTEB: + # - cmrc2018 + # - belle_2m + # - firefily + # - alpaca_gpt4 + # - zhihu_kol + # - hc3_chinese + # - amazon_reviews_multi (intersects with AmazonReviewsClassification) + # - qa: Encyclopedia QA dataset + # - xlsum + # - wiki_atomic_edit + # - chatmed_consult + # - webqa + # - dureader_robust + # - csl + # - lawzhidao + # - CINLID + # - DuSQL + # - Zhuiyi-NL2SQL + # - Cspider + # - news2016zh + # - baike2018qa + # - webtext2019zh + # - SimCLUE + # - SQuAD +} + +m3e_base = ModelMeta( + name="moka-ai/m3e-base", + languages=["zho_Hans", "eng-Latn"], + open_weights=True, + revision="764b537a0e50e5c7d64db883f2d2e051cbe3c64c", + release_date="2023-06-06", # first commit + n_parameters=102 * 1e6, + memory_usage=None, + embed_dim=768, + # They don't give a specific license but commercial use is not allowed + license="unspecified-noncommercial", + max_tokens=512, + reference="https://huggingface.co/moka-ai/m3e-base", + similarity_fn_name="cosine", + framework=["Sentence Transformers", 
"PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=False, # Not published + public_training_data=False, # They haven't published it yet + training_datasets=m3e_dataset, +) + +m3e_small = ModelMeta( + name="moka-ai/m3e-small", + languages=["zho_Hans", "eng-Latn"], + open_weights=True, + revision="44c696631b2a8c200220aaaad5f987f096e986df", + release_date="2023-06-02", # first commit + n_parameters=None, # Can't be seen on HF page + memory_usage=None, + embed_dim=512, + # They don't give a specific license but commercial use is not allowed + license="unspecified-noncommercial", + max_tokens=512, + reference="https://huggingface.co/moka-ai/m3e-small", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=False, # Not published + public_training_data=False, # They haven't published it yet + training_datasets=m3e_dataset, +) + + +m3e_large = ModelMeta( + name="moka-ai/m3e-large", + languages=["zho_Hans", "eng-Latn"], + open_weights=True, + revision="12900375086c37ba5d83d1e417b21dc7d1d1f388", + release_date="2023-06-21", # first commit + n_parameters=None, # Can't be seen on HF page + memory_usage=None, + embed_dim=768, + # They don't give a specific license but commercial use is not allowed + license="unspecified-noncommercial", + max_tokens=512, + reference="https://huggingface.co/moka-ai/m3e-large", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=False, # Not published + public_training_data=False, # They haven't published it yet + training_datasets=m3e_dataset, +) diff --git a/mteb/models/overview.py b/mteb/models/overview.py index 4e19bed19..f1abb1014 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -29,6 +29,7 @@ llm2vec_models, misc_models, model2vec_models, + moka_models, mxbai_models, no_instruct_sentence_models, nomic_models, diff --git a/mteb/models/piccolo_models.py b/mteb/models/piccolo_models.py new file mode 100644 index 000000000..17ea1fc2a --- /dev/null +++ b/mteb/models/piccolo_models.py @@ -0,0 +1,50 @@ +"""Piccolo Chinese embedding models by SenseNova""" + +from __future__ import annotations + +from mteb.model_meta import ModelMeta + +piccolo_base_zh = ModelMeta( + name="sensenova/piccolo-base-zh", + languages=["zho_Hans"], + open_weights=True, + revision="47c0a63b8f667c3482e05b2fd45577bb19252196", + release_date="2023-09-04", # first commit + n_parameters=None, # can't see on model card + memory_usage=None, + embed_dim=768, + license="mit", + max_tokens=512, + reference="https://huggingface.co/sensenova/piccolo-base-zh", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=False, + public_training_data=False, + training_datasets=None, # They don't specify +) + +piccolo_large_zh_v2 = ModelMeta( + name="sensenova/piccolo-large-zh-v2", + languages=["zho_Hans"], + open_weights=False, # They "temporarily" removed it in may last year + # "Due to certain internal company considerations" + revision="05948c1d889355936bdf9db7d30df57dd78d25a3", + release_date="2024-04-22", # first commit + n_parameters=None, # we don't know because they removed the model + memory_usage=None, + embed_dim=1024, + license="not specified", + max_tokens=512, + 
reference="https://huggingface.co/sensenova/piccolo-large-zh-v2", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=False, + public_training_data=False, + training_datasets=None, # They don't say +) diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index 28349d60d..18b08f16f 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -218,7 +218,7 @@ "NQ": ["test"], "NQHardNegatives": ["test"], "MSMARCO": ["train"], - # Non MTEB sources + # Non MTEB source # "s2orc": ["train"], # "flax-sentence-embeddings/stackexchange_xml": ["train"], # "ms_marco": ["train"], @@ -242,6 +242,82 @@ }, ) +# Source: https://arxiv.org/pdf/1907.04307 +use_multilingual_languages = [ + "ara-Arab", # Arabic + "zho-Hans", # Chinese (Simplified, PRC) + "zho-Hant", # Chinese (Traditional, Taiwan) + "nld-Latn", # Dutch + "eng-Latn", # English + "deu-Latn", # German + "fra-Latn", # French + "ita-Latn", # Italian + "por-Latn", # Portuguese + "spa-Latn", # Spanish + "jpn-Jpan", # Japanese + "kor-Kore", # Korean + "rus-Cyrl", # Russian + "pol-Latn", # Polish + "tha-Thai", # Thai + "tur-Latn", # Turkish +] +use_multilingual_training_data = { + # I'm not certain since they mined this themselves, but I would assume that there is significant overlap + "StackOverflowQARetrieval": ["train", "test"], + # Not in MTEB: + # - SNLI translated to 15 languages (could have intersections with other NLI datasets) + # - Translation pairs: Mined from the internet + # - QA mined from Reddit, StackOverflow, YahooAnswers (could be problematic) +} +distiluse_base_multilingual_cased_v2 = ModelMeta( + name="sentence-transformers/distiluse-base-multilingual-cased-v2", + languages=use_multilingual_languages, + open_weights=True, + revision="dad0fa1ee4fa6e982d3adbce87c73c02e6aee838", + release_date="2021-06-22", # First commit + n_parameters=135 * 1e6, + memory_usage=None, + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=True, + public_training_data=True, + training_datasets=use_multilingual_training_data, +) + +use_cmlm_multilingual = ModelMeta( + name="sentence-transformers/use-cmlm-multilingual", + languages=paraphrase_langs, + open_weights=True, + revision="6f8ff6583c371cbc4d6d3b93a5e37a888fd54574", + release_date="2022-04-14", # First commit + n_parameters=472 * 1e6, + memory_usage=None, + embed_dim=768, + license="apache-2.0", + max_tokens=256, + reference="https://huggingface.co/sentence-transformers/use-cmlm-multilingual", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from="sentence-transformers/LaBSE", + public_training_code=True, + public_training_data=True, + training_datasets={ + # Not in MTEB: + # - SNLI + # - Translation corpus based largely on Uszkoreit et al. 
(2010) + }, +) + + jina_embeddings_v2_base_en = ModelMeta( name="jinaai/jina-embeddings-v2-base-en", languages=["eng-Latn"], @@ -262,6 +338,35 @@ training_datasets={"allenai/c4": ["train"]}, ) +jina_embeddings_v2_base_zh = ModelMeta( + name="jinaai/jina-embeddings-v2-base-zh", + languages=["eng-Latn", "zho-Hans"], + open_weights=True, + revision="c1ff9086a89a1123d7b5eff58055a665db4fb4b9", + release_date="2024-01-10", + n_parameters=161_000_000, + memory_usage=None, + embed_dim=768, + license="apache-2.0", + max_tokens=8192, + reference="https://huggingface.co/jinaai/jina-embeddings-v2-base-zh", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + training_datasets={ + # source: https://arxiv.org/pdf/2402.17016 + "XNLI": ["train"], + "MLSumClusteringS2S": ["train"], + "MLSumClusteringP2P": ["train"], + # Not in MTEB: + # - MQA + # - XLSUM + }, +) + + jina_embeddings_v2_small_en = ModelMeta( name="jinaai/jina-embeddings-v2-small-en", languages=["eng-Latn"], From 950f050e330e855b36c885f62c3baa55f9ce8fbb Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 15 Jan 2025 15:41:42 +0000 Subject: [PATCH 15/49] 1.29.6 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9a5459793..53d56ef4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.5" +version = "1.29.6" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 60c49804fe0bbf10a4dde7cc63a9002f5eee6d40 Mon Sep 17 00:00:00 2001 From: Samuel Yang Date: Thu, 16 Jan 2025 00:18:44 +0800 Subject: [PATCH 16/49] Add model inf-retriever-v1 (#1744) * feat(models): add infly/inf-retriever-v1 model metadata- Add inf_models.py file with metadata for infly/inf-retriever-v1 model - Update overview.py to include inf_models in model imports * Reformat code * Update inf-retriever-v1 ModelMeta * Fill more information for inf-retriever-v1 * Add license information for inf-retriever-v1 --------- Co-authored-by: Samuel Yang --- mteb/models/inf_models.py | 32 ++++++++++++++++++++++++++++++++ mteb/models/overview.py | 2 ++ 2 files changed, 34 insertions(+) create mode 100644 mteb/models/inf_models.py diff --git a/mteb/models/inf_models.py b/mteb/models/inf_models.py new file mode 100644 index 000000000..4670b2073 --- /dev/null +++ b/mteb/models/inf_models.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from functools import partial + +from mteb.model_meta import ModelMeta, sentence_transformers_loader + +inf_retriever_v1 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="infly/inf-retriever-v1", + revision="d2d074546028c0012b5cc6af78c4fac24896e67f", + trust_remote_code=True, + ), + name="infly/inf-retriever-v1", + languages=["eng_Latn", "zho_Hans"], + open_weights=True, + revision="d2d074546028c0012b5cc6af78c4fac24896e67f", + release_date="2024-12-24", # initial commit of hf model. 
+ n_parameters=7_069_121_024, + memory_usage=None, + embed_dim=3584, + license="apache-2.0", + max_tokens=131_072, + reference="https://huggingface.co/infly/inf-retriever-v1", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + adapted_from="Alibaba-NLP/gte-Qwen2-7B-instruct", + public_training_code=False, + public_training_data=False, + training_datasets=None, +) diff --git a/mteb/models/overview.py b/mteb/models/overview.py index f1abb1014..634530089 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -23,6 +23,7 @@ gritlm_models, gte_models, ibm_granite_models, + inf_models, jasper_models, jina_models, linq_models, @@ -62,6 +63,7 @@ gritlm_models, gte_models, ibm_granite_models, + inf_models, jina_models, linq_models, llm2vec_models, From 54018c799b2fa9ea30fbbd06557318a6ffedc85b Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Wed, 15 Jan 2025 21:29:05 +0500 Subject: [PATCH 17/49] [v2] Remove deprecated parameters from `MTEB` and cli (#1773) * remove deprecated parameters * remove _task_langs * lint * fixes * fixes * fixes * fix all abs tasks * change to get tasks * try to fix * final fixes * back to tuple * update args description --- mteb/cli.py | 21 +-- mteb/evaluation/MTEB.py | 203 +++---------------------- mteb/overview.py | 2 +- tests/test_benchmark/task_grid.py | 52 +++---- tests/test_benchmark/test_benchmark.py | 4 +- tests/test_overview.py | 5 - tests/test_reproducible_workflow.py | 4 +- tests/test_tasks/test_all_abstasks.py | 9 +- tests/test_tasks/test_mteb_rerank.py | 9 +- 9 files changed, 56 insertions(+), 253 deletions(-) diff --git a/mteb/cli.py b/mteb/cli.py index 3c6c821f5..c552394e4 100644 --- a/mteb/cli.py +++ b/mteb/cli.py @@ -374,26 +374,7 @@ def main(): add_create_meta_parser(subparsers) args = parser.parse_args() - - # If no subcommand is provided, default to run with a deprecation warning - if not hasattr(args, "func"): - logger.warning( - "Using `mteb` without a subcommand is deprecated. Use `mteb run` instead.", - DeprecationWarning, - ) - # Set default arguments for 'run' if no subcommand is provided - default_args = parser.parse_args( - ["run"] - + list(map(str, args._get_args())) - + [ - f"--{k}" if v is None else f"--{k}={v}" - for k, v in vars(args).items() - if k != "func" - ] - ) - default_args.func(default_args) - else: - args.func(args) + args.func(args) if __name__ == "__main__": diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 0c07ff34d..3c94f2478 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -5,28 +5,29 @@ import os import traceback from collections.abc import Iterable -from copy import copy, deepcopy +from copy import deepcopy from datetime import datetime from itertools import chain from pathlib import Path from time import time -from typing import Any +from typing import TYPE_CHECKING, Any import datasets from codecarbon import EmissionsTracker from sentence_transformers import CrossEncoder, SentenceTransformer +import mteb from mteb.abstasks.AbsTask import ScoresDict from mteb.encoder_interface import Encoder from mteb.model_meta import ModelMeta from mteb.models import model_meta_from_sentence_transformers from ..abstasks.AbsTask import AbsTask -from ..abstasks.AbsTaskMultilabelClassification import AbsTaskMultilabelClassification -from ..abstasks.AbsTaskReranking import AbsTaskReranking from ..load_results.task_results import TaskResult from ..models.sentence_transformer_wrapper import SentenceTransformerWrapper -from . 
import LangMapping + + if TYPE_CHECKING: + from mteb.benchmarks import Benchmark logger = logging.getLogger(__name__) @@ -34,124 +35,41 @@ class MTEB: def __init__( self, - tasks: Iterable[str | AbsTask] | None = None, + tasks: Iterable[AbsTask | Benchmark], *, - task_types: list[str] | None = None, - task_categories: list[str] | None = None, - task_langs: list[str] | None = None, - version=None, err_logs_path: str = "error_logs.txt", - **kwargs, ): """Create an Evaluation pipeline, based on the provided tasks. Args: - tasks: List of tasks to be evaluated. - task_types: Will be deprecated we recommend that you use `mteb.get_tasks()` to filter tasks. List of task types (Clustering, Retrieval..) to be - evaluated. If None, all tasks will be evaluated - task_categories: Will be deprecated we recommend that you use `mteb.get_tasks()` to filter tasks. List of task categories (s2s, p2p..) to be - evaluated. If None, all tasks will be evaluated - task_langs: Will be deprecated we recommend that you use `mteb.get_tasks()` to filter tasks. List of languages to be evaluated. if None, all - languages will be evaluated. ["eng-Latn", "deu_Latn"] will evaluate on all tasks with these languages. - version: Will be deprecated. Version of the benchmark to use. If None, latest is used + tasks: List of tasks or benchmarks to be evaluated, e.g. tasks returned by + `mteb.get_tasks(["task1","task2"])` or `mteb.get_benchmark("MTEB(eng, classic)")`. err_logs_path: Path to save error logs. - kwargs: Additional arguments to be passed to the tasks """ from mteb.benchmarks import Benchmark - self.deprecation_warning( - task_types, task_categories, task_langs, tasks, version - ) - - if tasks is not None: - self._tasks = tasks - if isinstance(tasks[0], Benchmark): - self.benchmarks = tasks - self._tasks = list(chain.from_iterable(tasks)) - assert ( - task_types is None and task_categories is None - ), "Cannot specify both `tasks` and `task_types`/`task_categories`" - else: - self._task_types = task_types - self._task_categories = task_categories - self._tasks = None - - self._task_langs = task_langs if task_langs is not None else [] - if isinstance(self._task_langs, str): - self._task_langs = [self._task_langs] + self.tasks = list(tasks) + if len(self.tasks) > 0 and isinstance(self.tasks[0], Benchmark): + self.benchmarks = tasks + self.tasks = list(chain.from_iterable(self.tasks)) - self._extend_lang_code() - self._extend_lang_pairs() # add all possible pairs - - self._version = version self.err_logs_path = err_logs_path - self.last_evaluated_splits = {} - self.select_tasks(**kwargs) - - def deprecation_warning( - self, task_types, task_categories, task_langs, tasks, version - ): - if task_types is not None: - logger.warning( - "The `task_types` argument is deprecated and will be removed in the next release. " - + "Please use `tasks = mteb.get_tasks(... task_types = [...])` to filter tasks instead." - ) - if task_categories is not None: - logger.warning( - "The `task_categories` argument is deprecated and will be removed in the next release. " - + "Please use `tasks = mteb.get_tasks(... categories = [...])` to filter tasks instead." - ) - if task_langs is not None: - logger.warning( - "The `task_langs` argument is deprecated and will be removed in the next release. " - + "Please use `tasks = mteb.get_tasks(... languages = [...])` to filter tasks instead. " - + "Note that this uses 3 letter language codes (ISO 639-3)."
- ) - if version is not None: - logger.warning( - "The `version` argument is deprecated and will be removed in the next release." - ) - task_contains_strings = any(isinstance(x, str) for x in tasks or []) - if task_contains_strings: - logger.warning( - "Passing task names as strings is deprecated and will be removed in the next release. " - + "Please use `tasks = mteb.get_tasks(tasks=[...])` method to get tasks instead." - ) - @property def available_tasks(self): - return [x.metadata.name for x in self.tasks_cls] + return [x.metadata.name for x in self.tasks] @property def available_task_types(self): # sort the task types - return sorted({x.metadata.type for x in self.tasks_cls}) + return sorted({x.metadata.type for x in self.tasks}) @property def available_task_categories(self): - return {x.metadata.category for x in self.tasks_cls} - - def _extend_lang_code(self): - # add all possible language codes - for lang in set(self._task_langs): - if lang in LangMapping.LANG_MAPPING: - self._task_langs += LangMapping.LANG_MAPPING[lang] - - def _extend_lang_pairs(self): - # add all possible language pairs - langs = set(self._task_langs) - for x in langs: - if "-" not in x: - for y in langs: - if "-" not in y: - pair = f"{x}-{y}" - if pair not in langs: - self._task_langs.append(pair) - return - - def _display_tasks(self, task_list, name=None): + return {x.metadata.category for x in self.tasks} + + def _display_tasks(self, task_list: Iterable[AbsTask], name: str | None = None): from rich.console import Console # disable logging for other ranks @@ -215,80 +133,14 @@ def mteb_benchmarks(self): @classmethod def mteb_tasks(cls): """Get all tasks available in the MTEB.""" - instance = cls() - instance._display_tasks(instance.tasks_cls, name="MTEB tasks") + tasks = mteb.get_tasks() + instance = cls(tasks) + instance._display_tasks(tasks, name="MTEB tasks") def print_selected_tasks(self): """Print the selected tasks.""" self._display_tasks(self.tasks, name="Selected tasks") - def select_tasks(self, **kwargs): - """Select the tasks to be evaluated.""" - # Get all existing tasks - # reranking and multiclassClassification subclasses retrieval to share methods, but is an abstract task - tasks_categories_cls = list(AbsTask.__subclasses__()) + [ - AbsTaskReranking, - AbsTaskMultilabelClassification, - ] - all_task_classes = [] - for cat_cls in tasks_categories_cls: - for cls in cat_cls.__subclasses__(): - if cat_cls.__name__.startswith("AbsTask") and cls.__name__ not in ( - "AbsTaskReranking", - "AbsTaskMultilabelClassification", - ): - task = cls(hf_subsets=self._task_langs, **kwargs) - all_task_classes.append(task) - - self.tasks_cls = all_task_classes - - # If `task_list` is specified, select list of tasks - if self._tasks is not None: - self.tasks = list( - filter(lambda x: (x.metadata.name in self._tasks), self.tasks_cls) - ) - if len(self.tasks) != len(self._tasks): - tasks_known = {x.metadata.name for x in self.tasks_cls} - tasks_unknown = { - x for x in self._tasks if isinstance(x, str) - } - tasks_known - if tasks_unknown: - unknown_str, known_str = ( - ",".join(sorted(tasks_unknown)), - ",".join(sorted(tasks_known)), - ) - logger.warning( - f"WARNING: Unknown tasks: {unknown_str}. Known tasks: {known_str}." 
- ) - # add task if subclass of mteb.tasks - self.tasks.extend([x for x in self._tasks if isinstance(x, AbsTask)]) - return - - # Otherwise use filters to select tasks - filtered_tasks = filter( - lambda x: (self._task_types is None) - or (x.metadata.type in self._task_types), - self.tasks_cls, - ) - filtered_tasks = filter( - lambda x: (self._task_categories is None) - or (x.metadata.category in self._task_categories), - filtered_tasks, - ) - filtered_tasks = filter( - lambda x: (self._version is None) or (x.metadata.version >= self._version), - filtered_tasks, - ) - # keep only tasks with at least one language in the filter - filtered_tasks = filter( - lambda x: (not self._task_langs) - or (len(set(x.metadata.eval_langs) & set(self._task_langs)) > 0), - filtered_tasks, - ) - - # Get final list of tasks - self.tasks = list(filtered_tasks) - def load_tasks_data(self): """Load datasets for the selected tasks.""" logger.info(f"\n\n## Loading datasets for {len(self.tasks)} tasks") @@ -416,13 +268,6 @@ def run( Returns: A list of TaskResult objects, one for each task evaluated. """ - if "batch_size" in kwargs: - logger.warning( - "The `batch_size` argument is deprecated and will be removed in the next release. " - + "Please use `encode_kwargs = {'batch_size': ...}` to set the batch size instead." - ) - encode_kwargs["batch_size"] = kwargs["batch_size"] - # update logging to account for different levels of Verbosity (similar to the command line) if verbosity == 0: @@ -455,8 +300,8 @@ def run( self.print_selected_tasks() evaluation_results = [] - original_tasks = ( - self.tasks.copy() + original_tasks = deepcopy( + self.tasks ) # save them in case we re-use the object (e.g. for reranking) # To evaluate missing splits, we keep track of the task name and the corresponding splits. 
@@ -665,7 +510,7 @@ def create_model_meta(model: Encoder) -> ModelMeta: ) # create a copy of the meta to avoid modifying the original object - meta = copy(meta) + meta = deepcopy(meta) meta.revision = meta.revision or "no_revision_available" meta.name = meta.name or "no_model_name_available" diff --git a/mteb/overview.py b/mteb/overview.py index 5846993b0..39d96041b 100644 --- a/mteb/overview.py +++ b/mteb/overview.py @@ -124,7 +124,7 @@ def __repr__(self) -> str: return "MTEBTasks" + super().__repr__() @staticmethod - def _extract_property_from_task(task, property): + def _extract_property_from_task(task, property: str): if hasattr(task.metadata, property): return getattr(task.metadata, property) elif hasattr(task, property): diff --git a/tests/test_benchmark/task_grid.py b/tests/test_benchmark/task_grid.py index 8ae310555..3ad484b6f 100644 --- a/tests/test_benchmark/task_grid.py +++ b/tests/test_benchmark/task_grid.py @@ -2,14 +2,8 @@ from __future__ import annotations +import mteb from mteb.abstasks import AbsTask -from mteb.tasks.BitextMining.dan.BornholmskBitextMining import BornholmBitextMining -from mteb.tasks.Classification.multilingual.IndicSentimentClassification import ( - IndicSentimentClassification, -) -from mteb.tasks.Clustering.eng.TwentyNewsgroupsClustering import ( - TwentyNewsgroupsClusteringFast, -) from .mock_tasks import ( MockBitextMiningTask, @@ -39,31 +33,25 @@ MockSummarizationTask, ) -twenty_news = TwentyNewsgroupsClusteringFast() - -# downsample to speed up tests -twenty_news.max_document_to_embed = 1000 -twenty_news.n_clusters = 2 -twenty_news.max_fraction_of_documents_to_embed = None - -TASK_TEST_GRID = [ - BornholmBitextMining(), # bitext mining + just supplying a task class instead of a string - IndicSentimentClassification( # multi subset loader - hf_subsets=["as"], # we only load one subset here to speed up tests - n_experiments=2, # to speed up the test - ), - "TwentyNewsgroupsClustering", # clustering and string instead of class - twenty_news, # fast clustering - "Banking77Classification", # classification - "SciDocsRR", # reranking - "FarsTail", # pair classification - "TwitterHjerneRetrieval", # retrieval - "BrazilianToxicTweetsClassification", # multilabel classification - "FaroeseSTS", # STS - "SummEval", # summarization - "Core17InstructionRetrieval", # instruction reranking - "InstructIR", # instruction retrieval -] +TASK_TEST_GRID = ( + mteb.get_tasks( + tasks=[ + "BornholmBitextMining", # bitext mining + just supplying a task class instead of a string + "TwentyNewsgroupsClustering", # clustering and string instead of class + "TwentyNewsgroupsClustering.v2", # fast clustering + "Banking77Classification", # classification + "SciDocsRR", # reranking + "FarsTail", # pair classification + "TwitterHjerneRetrieval", # retrieval + "BrazilianToxicTweetsClassification", # multilabel classification + "FaroeseSTS", # STS + "SummEval", # summarization + "Core17InstructionRetrieval", # instruction reranking + "InstructIR", # instruction retrieval + ] + ) + + mteb.get_tasks(tasks=["IndicSentimentClassification"], languages=["asm-Beng"]) +) TASK_TEST_GRID_AS_STRING = [ t.metadata.name if isinstance(t, AbsTask) else t for t in TASK_TEST_GRID diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index 0c8521578..1393d46f1 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -67,7 +67,7 @@ def test_benchmark_encoders_on_task(task: str | AbsTask, model: mteb.Encoder): 
eval.run(model, output_folder="tests/results", overwrite_results=True) -@pytest.mark.parametrize("task", [MockMultilingualRetrievalTask]) +@pytest.mark.parametrize("task", [MockMultilingualRetrievalTask()]) @pytest.mark.parametrize( "model", [MockSentenceTransformer()], @@ -188,7 +188,7 @@ def test_run_using_benchmark(model: mteb.Encoder): name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"]) ) - eval = mteb.MTEB(tasks=bench) + eval = mteb.MTEB(tasks=[bench]) eval.run( model, output_folder="tests/results", overwrite_results=True ) # we just want to test that it runs diff --git a/tests/test_overview.py b/tests/test_overview.py index 127e54f27..6136af1ea 100644 --- a/tests/test_overview.py +++ b/tests/test_overview.py @@ -98,8 +98,3 @@ def test_MTEBTasks( # check for header of a table n_langs = len(tasks) assert len(tasks.to_markdown().split("\n")) - 3 == n_langs - - -def test_all_tasks_fetch(): - """Test that all tasks can be fetched""" - mteb.MTEB.mteb_tasks() diff --git a/tests/test_reproducible_workflow.py b/tests/test_reproducible_workflow.py index 566864a11..1c7536076 100644 --- a/tests/test_reproducible_workflow.py +++ b/tests/test_reproducible_workflow.py @@ -36,7 +36,7 @@ def test_reproducibility_workflow(task_name: str, model_name: str, model_revisio @pytest.mark.parametrize( "task_name", TASK_TEST_GRID - + [ + + ( "BitextMining", "Classification", "MultilabelClassification", @@ -49,7 +49,7 @@ def test_reproducibility_workflow(task_name: str, model_name: str, model_revisio "InstructionRetrieval", "InstructionReranking", "Speed", - ], + ), ) def test_validate_task_to_prompt_name(task_name: str | AbsTask): if isinstance(task_name, AbsTask): diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py index af6613327..91a7b9507 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -8,20 +8,17 @@ import pytest import mteb -from mteb import MTEB from mteb.abstasks import AbsTask, MultilingualTask from mteb.abstasks.AbsTaskReranking import AbsTaskReranking from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval from mteb.abstasks.AbsTaskSpeedTask import AbsTaskSpeedTask -from mteb.overview import TASKS_REGISTRY +from mteb.overview import TASKS_REGISTRY, get_tasks from ..test_benchmark.task_grid import MOCK_TASK_TEST_GRID_AS_STRING logging.basicConfig(level=logging.INFO) -tasks = [ - t for t in MTEB().tasks_cls if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING -] +tasks = [t for t in get_tasks() if t.metadata.name not in MOCK_TASK_TEST_GRID_AS_STRING] @pytest.mark.parametrize("task", tasks) @@ -84,7 +81,7 @@ async def check_datasets_are_available_on_hf(tasks): def test_dataset_availability(): """Checks if the datasets are available on Hugging Face using both their name and revision.""" - tasks = MTEB().tasks_cls + tasks = get_tasks() tasks = [ t for t in tasks diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py index c540bb41e..565b00e22 100644 --- a/tests/test_tasks/test_mteb_rerank.py +++ b/tests/test_tasks/test_mteb_rerank.py @@ -6,6 +6,7 @@ from sentence_transformers import CrossEncoder, SentenceTransformer +import mteb from mteb import MTEB from mteb.model_meta import ModelMeta @@ -318,11 +319,7 @@ def test_mteb_rerank(tmp_path: Path): "1395", ] model = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2") - eval = MTEB( - tasks=[ - "SciFact", - ] - ) + eval = MTEB(tasks=mteb.get_tasks(["SciFact"])) # create fake first stage results tmp_file = 
tmp_path / "tmp.json" with open(tmp_file, "w") as f: @@ -374,7 +371,7 @@ def test_reranker_same_ndcg1(): revision=ce_revision, release_date="2021-04-15", ) - eval = MTEB(tasks=["SciFact"]) + eval = MTEB(tasks=mteb.get_tasks(["SciFact"])) eval.run( de, output_folder="tests/results/stage1", From 3a5aa0c1e5d57507841205a8708c9dbc21557991 Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Thu, 16 Jan 2025 01:39:12 +0500 Subject: [PATCH 18/49] [v2] remove metadata_dict (#1820) * remove metadata_dict * Update mteb/overview.py Co-authored-by: Isaac Chung --------- Co-authored-by: Isaac Chung --- mteb/abstasks/AbsTask.py | 4 ---- mteb/overview.py | 4 +--- scripts/data/create_task_table.py | 11 +++-------- 3 files changed, 4 insertions(+), 15 deletions(-) diff --git a/mteb/abstasks/AbsTask.py b/mteb/abstasks/AbsTask.py index 399157757..e8876f3ff 100644 --- a/mteb/abstasks/AbsTask.py +++ b/mteb/abstasks/AbsTask.py @@ -278,10 +278,6 @@ def _calculate_metrics_from_split( ) -> DescriptiveStatistics: raise NotImplementedError - @property - def metadata_dict(self) -> dict[str, Any]: - return dict(self.metadata) - @property def languages(self) -> list[str]: """Returns the languages of the task""" diff --git a/mteb/overview.py b/mteb/overview.py index 39d96041b..31bc5130e 100644 --- a/mteb/overview.py +++ b/mteb/overview.py @@ -124,13 +124,11 @@ def __repr__(self) -> str: return "MTEBTasks" + super().__repr__() @staticmethod - def _extract_property_from_task(task, property: str): + def _extract_property_from_task(task: AbsTask, property: str): if hasattr(task.metadata, property): return getattr(task.metadata, property) elif hasattr(task, property): return getattr(task, property) - elif property in task.metadata_dict: - return task.metadata_dict[property] else: raise KeyError("Property neither in Task attribute or in task metadata.") diff --git a/scripts/data/create_task_table.py b/scripts/data/create_task_table.py index e5b292a08..e15edb482 100644 --- a/scripts/data/create_task_table.py +++ b/scripts/data/create_task_table.py @@ -137,14 +137,9 @@ def get_ds_stats(hf_hub_name): # Select all tasks for task in MTEB().tasks: print("Task: ", task) - if "dataset" in task.metadata_dict: - hub_name = hub_url = task.metadata.dataset["path"] - ds_stats = get_ds_stats(hub_name.split("/")[-1]) - elif "beir_name" in task.metadata_dict: - hub_name = hub_url = "BeIR/" + task.metadata_dict.get("beir_name") - ds_stats = get_ds_stats_beir("/".join(hub_name.split("/")[1:])) - if "cqadupstack" in hub_name: - hub_url = "BeIR/cqadupstack-qrels" + hub_name = hub_url = task.metadata.dataset["path"] + ds_stats = get_ds_stats(hub_name.split("/")[-1]) + TABLE_STRING += "\n" + ONE_LINE.format( f"[{task.metadata.name}]({task.metadata.reference})", f"[{hub_name}](https://huggingface.co/datasets/{hub_url})", From d7a77918cc0e8b8f03cbbe5199e8a0fe58e429d9 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Thu, 16 Jan 2025 09:43:11 +0900 Subject: [PATCH 19/49] ci: only return 1 model_name per file (#1818) * only return 1 model_name per file * fix args parse * revert test change --- Makefile | 2 +- scripts/extract_model_names.py | 41 ++++++++++++++++++++++++++++++---- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 02d0ba247..9729d080f 100644 --- a/Makefile +++ b/Makefile @@ -41,5 +41,5 @@ build-docs: model-load-test: @echo "--- 🚀 Running model load test ---" pip install ".[dev, speedtask, pylate,gritlm,xformers,model2vec]" - python scripts/extract_model_names.py $(BASE_BRANCH) + python 
scripts/extract_model_names.py $(BASE_BRANCH) --return_one_model_name_per_file python tests/test_models/model_loading.py --model_name_file scripts/model_names.txt \ No newline at end of file diff --git a/scripts/extract_model_names.py b/scripts/extract_model_names.py index ba1bc1a8b..6cbaa2c29 100644 --- a/scripts/extract_model_names.py +++ b/scripts/extract_model_names.py @@ -1,11 +1,14 @@ from __future__ import annotations +import argparse import ast -import sys +import logging from pathlib import Path from git import Repo +logging.basicConfig(level=logging.INFO) + def get_changed_files(base_branch="main"): repo_path = Path(__file__).parent.parent @@ -28,8 +31,11 @@ def get_changed_files(base_branch="main"): ] -def extract_model_names(files: list[str]) -> list[str]: +def extract_model_names( + files: list[str], return_one_model_name_per_file=False +) -> list[str]: model_names = [] + first_model_found = False for file in files: with open(file) as f: tree = ast.parse(f.read()) @@ -52,17 +58,44 @@ def extract_model_names(files: list[str]) -> list[str]: ) if model_name: model_names.append(model_name) + first_model_found = True + if return_one_model_name_per_file and first_model_found: + logging.info(f"Found model name {model_name} in file {file}") + break # NOTE: Only take the first model_name per file to avoid disk out of space issue. return model_names +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "base_branch", + nargs="?", + default="main", + help="Base branch to compare changes with", + ) + parser.add_argument( + "--return_one_model_name_per_file", + action="store_true", + default=False, + help="Only return one model name per file.", + ) + return parser.parse_args() + + if __name__ == "__main__": """ Can pass in base branch as an argument. Defaults to 'main'. e.g. 
python extract_model_names.py mieb """ - base_branch = sys.argv[1] if len(sys.argv) > 1 else "main" + + args = parse_args() + + base_branch = args.base_branch changed_files = get_changed_files(base_branch) - model_names = extract_model_names(changed_files) + model_names = extract_model_names( + changed_files, + return_one_model_name_per_file=args.return_one_model_name_per_file, + ) output_file = Path(__file__).parent / "model_names.txt" with output_file.open("w") as f: f.write(" ".join(model_names)) From 4ac59bcdfbed8604b05e067b8b7df79f47b0d7a7 Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Thu, 16 Jan 2025 15:11:00 +0500 Subject: [PATCH 20/49] fix: add bge-m3 `ModelMeta` (#1821) add bge --- mteb/models/bge_models.py | 235 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 235 insertions(+) diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index 23851b498..56efff84d 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -89,6 +89,215 @@ # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) } +bgem3_training_data = { + # source https://arxiv.org/abs/2402.03216 + "T2Retrieval": ["train"], + "DuReader": ["train"], + "MMarcoReranking": ["train"], + "CMedQAv2-reranking": ["train"], + "HotpotQA": ["train"], + "NQ": ["train"], + "MSMARCO": ["train"], + "MrTidyRetrieval": ["train"], + "MIRACLRetrieval": ["train"], + "CodeSearchNet": ["train"], + # not in mteb + # "s2orc" + # Wikipedia + # "xP3" + # "mC4" + # "CC-News" + # "MTP" + # "NLLB" + # "CCMatrix" + # TriviaQA + # COL-IEE + # PubMedQA + # SQuAD + # SimCSE + # mMARCO-ZH + # LawGPT + # NLI-zh2, LeCaRDv2, + # NLI, MultiLongDoc (their syntetic) +} + +# https://huggingface.co/BAAI/bge-m3/discussions/29 +bgem3_languages = [ + "afr_Latn", # af + # als + "amh_Ethi", # am + # an + # ar + "azj_Latn", # arz + # as + "ast_Latn", # ast + # av + # az + "azj_Latn", # azb + # ba + # bar + # bcl + "ben_Beng", # be + "bul_Cyrl", # bg + # bh + # bn + # bo + "bel_Cyrl", # bpy + # br + # bs + # bxr + "cat_Latn", # ca + # cbk + # ce + "ceb_Latn", # ceb + "ckb_Arab", # ckb + # co + # cs + # cv + # cy + "dan_Latn", # da + "deu_Latn", # de + # diq + # dsb + # dty + # dv + "ell_Grek", # el + # eml + "eng_Latn", # en + # eo + "est_Latn", # es + # et + # eu + # fa + "fin_Latn", # fi + "fra_Latn", # fr + # fy + # ga + # gd + "glg_Latn", # gl + # gn + # gom + "guj_Gujr", # gu + # gv + "heb_Hebr", # he + "hin_Deva", # hi + # hif + # hr + # hsb + # ht + # hu + # hy + # ia + # id + # ie + # ilo + # io + # is + "ita_Latn", # it + "jpn_Jpan", # ja + # jbo + # jv + # ka + # kk + # km + # kn + "kor_Hang", # ko + # krc + # ku + # kv + # kw + # ky + # la + # lb + # lez + # li + # lmo + # lo + # lt + # lv + # mai + # mg + # mhr + # min + # mk + # ml + # mn + # mr + # mrj + # ms + # mt + # mwl + # my + # myv + # mzn + # nah + # nap + # nds + # ne + # new + # nl + # nn + # no + # oc + # or + # os + # pa + # pam + # pfl + # pl + # pms + # pnb + # ps + # pt + # qu + # rm + # ro + "rus_Cyrl", # ru + # sa + # sah + # sc + # scn + # sco + # sd + # sh + # si + # sk + # sl + # so + # sq + # sr + # su + # sv + # sw + # ta + # te + # tg + "tha_Thai", # th + # tk + # tl + # tr + # tt + # tyv + # ug + "ukr_Cyrl", # uk + # ur + # uz + # vec + # vep + # vi + # vls + # vo + # wa + # war + # wuu + # xal + # xmf + # yi + # yo + # yue + "zho_Hans", # zh +] + bge_small_en_v1_5 = ModelMeta( loader=partial( # type: ignore sentence_transformers_loader, @@ -329,6 +538,32 @@ training_datasets=bge_chinese_training_data, ) 
+bge_m3 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="BAAI/bge-m3", + revision="5617a9f61b028005a4858fdac845db406aefb181", + ), + name="BAAI/bge-m3", + languages=bgem3_languages, + open_weights=True, + revision="5617a9f61b028005a4858fdac845db406aefb181", + release_date="2024-06-28", + n_parameters=568_000_000, + memory_usage=None, + embed_dim=1024, + license="mit", + max_tokens=8194, + reference="https://huggingface.co/BAAI/bge-m3", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_data=True, + public_training_code=None, + training_datasets=bgem3_training_data, +) + + bge_multilingual_gemma2 = ModelMeta( loader=partial( # type: ignore sentence_transformers_loader, From 9733d85cb66dd49187257b636c9eed2842b8f1fa Mon Sep 17 00:00:00 2001 From: github-actions Date: Thu, 16 Jan 2025 10:27:01 +0000 Subject: [PATCH 21/49] 1.29.7 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 53d56ef4b..1e1d07e99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.6" +version = "1.29.7" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From ce5cb3e859322d77f32019434f3d7cf9d72b02f5 Mon Sep 17 00:00:00 2001 From: Sam <40773225+sam-hey@users.noreply.github.com> Date: Fri, 17 Jan 2025 14:22:58 +0100 Subject: [PATCH 22/49] [v2] add similarity_fn in ModelMeta (#1759) * add dotwrapper * lint * make cleaner * add poc similarity_fn in ModelMeta * ref: rename EvaluationFunction to ScoringFunction Co-authored-by: Isaac Chung * make cos_sim default * Revert "make cleaner" This reverts commit 7d1e949f555066b08134ccacd89690e92554af30. * Revert "add dotwrapper" This reverts commit d71718b1bb6b0fc0cf378cea3b16528091fdd8d7.
* lint * fix: _run_eval no co tracking * fix: bm25s * add enum to models * add mapping st sim fn name to mteb sim fn name * fix model meta use new fn for sim operators * add max_sim * fix: colbert & rm similarity_fn_name * ci: skip AfriSentiLID for now (#1785) * skip AfriSentiLID for now * skip relevant test case instead --------- Co-authored-by: Isaac Chung * test: add test for bm25s and ColBERT * lint * feat: add mapping for max_sim from pylate https://github.com/lightonai/pylate/issues/77 * test: bm25s skip * fix: MaxSim as max_sim match pylate & rm Enum in models * rm enum * update tests skip --------- Co-authored-by: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Co-authored-by: sam021313 <40773225+sam021313@users.noreply.github.com> Co-authored-by: Isaac Chung Co-authored-by: Isaac Chung --- .../evaluators/RetrievalEvaluator.py | 1 + mteb/evaluation/evaluators/model_classes.py | 16 +++++-- mteb/evaluation/evaluators/utils.py | 28 ++++++++++++ mteb/model_meta.py | 16 ++++++- mteb/models/colbert_models.py | 7 ++- mteb/models/overview.py | 9 ++-- mteb/models/sentence_transformer_wrapper.py | 6 +-- mteb/models/wrapper.py | 15 ------- tests/test_benchmark/test_models.py | 44 +++++++++++++++++++ 9 files changed, 113 insertions(+), 29 deletions(-) create mode 100644 tests/test_benchmark/test_models.py diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py index be2f5af1f..9e088aacd 100644 --- a/mteb/evaluation/evaluators/RetrievalEvaluator.py +++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py @@ -82,6 +82,7 @@ def __call__( self.top_k, task_name=self.task_name, # type: ignore instructions=instructions, + score_function="bm25", **kwargs, ) else: diff --git a/mteb/evaluation/evaluators/model_classes.py b/mteb/evaluation/evaluators/model_classes.py index b05de30d7..b2d2c54be 100644 --- a/mteb/evaluation/evaluators/model_classes.py +++ b/mteb/evaluation/evaluators/model_classes.py @@ -332,9 +332,19 @@ def _full_corpus_search( query_embeddings = torch.as_tensor(query_embeddings).to(device) sub_corpus_embeddings = torch.as_tensor(sub_corpus_embeddings).to(device) - score_function = ( - self.model.similarity if hasattr(self.model, "similarity") else cos_sim - ) + if hasattr(self.model.model, "mteb_model_meta") or hasattr( + self.model, "similarity" + ): + score_function = ( + self.model.similarity + if hasattr(self.model, "similarity") + else self.model.model.mteb_model_meta.get_similarity_function() + ) + else: + logger.warning( + "The model does not provide `mteb_model_meta`; defaulting to the cosine similarity function." + ) + score_function = cos_sim with torch.inference_mode(): scores = score_function(query_embeddings, sub_corpus_embeddings) diff --git a/mteb/evaluation/evaluators/utils.py b/mteb/evaluation/evaluators/utils.py index e01e0ec46..14ca673ce 100644 --- a/mteb/evaluation/evaluators/utils.py +++ b/mteb/evaluation/evaluators/utils.py @@ -70,6 +70,34 @@ def _cos_sim_core(a_tensor, b_tensor): return _cos_sim_core(a, b) +def max_sim(a: np.ndarray, b: np.ndarray) -> np.ndarray: + """Computes the max-similarity max_sim(a[i], b[j]) for all i and j. 
+ Works with a Tensor of the shape (batch_size, num_tokens, token_dim) + + Return: + Matrix with res[i][j] = max_sim(a[i], b[j]) + """ # noqa: D402 + if not isinstance(a, torch.Tensor): + a = torch.tensor(a, dtype=torch.float32) + + if not isinstance(b, torch.Tensor): + b = torch.tensor(b, dtype=torch.float32) + + if len(a.shape) == 2: + a = a.unsqueeze(0) + + if len(b.shape) == 2: + b = b.unsqueeze(0) + + scores = torch.einsum( + "ash,bth->abst", + a, + b, + ) + + return scores.max(axis=-1).values.sum(axis=-1) + + def dot_score(a: torch.Tensor, b: torch.Tensor): """Computes the dot-product dot_prod(a[i], b[j]) for all i and j. :return: Matrix with res[i][j] = dot_prod(a[i], b[j]) diff --git a/mteb/model_meta.py b/mteb/model_meta.py index 1754ab4bb..bb063e7ba 100644 --- a/mteb/model_meta.py +++ b/mteb/model_meta.py @@ -4,11 +4,13 @@ from functools import partial from typing import TYPE_CHECKING, Any, Callable, Literal +import numpy as np from pydantic import BaseModel, ConfigDict from mteb.abstasks.AbsTask import AbsTask from mteb.abstasks.TaskMetadata import STR_DATE, STR_URL from mteb.encoder_interface import Encoder +from mteb.evaluation.evaluators.utils import cos_sim, dot_score, max_sim from .languages import ISO_LANGUAGE_SCRIPT @@ -30,7 +32,6 @@ "PyLate", "ColBERT", ] -DISTANCE_METRICS = Literal["cosine", "max_sim", "dot"] def sentence_transformers_loader( @@ -51,6 +52,9 @@ def get_loader_name( return loader.__name__ +DISTANCE_METRICS = Literal["cosine", "MaxSim", "dot"] + + class ModelMeta(BaseModel): """The model metadata object. @@ -106,6 +110,16 @@ class ModelMeta(BaseModel): superseded_by: str | None = None citation: str | None = None + def get_similarity_function(self) -> Callable[[np.ndarray, np.ndarray], np.ndarray]: + if self.similarity_fn_name == "cosine": + return cos_sim + elif self.similarity_fn_name == "dot": + return dot_score + elif self.similarity_fn_name == "MaxSim": + return max_sim + elif self.similarity_fn_name is None: + raise ValueError("Similarity function not specified.") + def to_dict(self): dict_repr = self.model_dump() loader = dict_repr.pop("loader", None) diff --git a/mteb/models/colbert_models.py b/mteb/models/colbert_models.py index 8753791bf..6ce7ca6fb 100644 --- a/mteb/models/colbert_models.py +++ b/mteb/models/colbert_models.py @@ -100,10 +100,13 @@ def encode( ) logger.info(f"Encoding {len(sentences)} sentences.") + if "request_qid" in kwargs: + kwargs.pop("request_qid") pred = self.model.encode( sentences, prompt_name=prompt_name, is_query=True if prompt_type == PromptType.query else False, + convert_to_tensor=True, **kwargs, ) @@ -158,7 +161,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: max_tokens=180, # Reduced for Benchmarking - see ColBERT paper embed_dim=None, # Bag of Embeddings (128) for each token license="mit", - similarity_fn_name="max_sim", + similarity_fn_name="MaxSim", framework=["PyLate", "ColBERT"], reference="https://huggingface.co/colbert-ir/colbertv2.0", use_instructions=False, @@ -209,7 +212,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: max_tokens=8192, embed_dim=None, # Bag of Embeddings (128) for each token license="cc-by-nc-4.0", - similarity_fn_name="max_sim", + similarity_fn_name="MaxSim", framework=["PyLate", "ColBERT"], reference="https://huggingface.co/jinaai/jina-colbert-v2", use_instructions=False, diff --git a/mteb/models/overview.py b/mteb/models/overview.py index 4e19bed19..e9774cacd 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -157,9 +157,12 
@@ def get_model(model_name: str, revision: str | None = None, **kwargs: Any) -> En model = meta.load_model(**kwargs) # If revision not available in the modelmeta, try to extract it from sentence-transformers - if meta.revision is None and isinstance(model, SentenceTransformer): - _meta = model_meta_from_sentence_transformers(model) - meta.revision = _meta.revision if _meta.revision else meta.revision + if isinstance(model.model, SentenceTransformer): + _meta = model_meta_from_sentence_transformers(model.model) + if meta.revision is None: + meta.revision = _meta.revision if _meta.revision else meta.revision + if not meta.similarity_fn_name: + meta.similarity_fn_name = _meta.similarity_fn_name model.mteb_model_meta = meta # type: ignore return model diff --git a/mteb/models/sentence_transformer_wrapper.py b/mteb/models/sentence_transformer_wrapper.py index 9ec25a989..bb4746783 100644 --- a/mteb/models/sentence_transformer_wrapper.py +++ b/mteb/models/sentence_transformer_wrapper.py @@ -21,7 +21,6 @@ def __init__( model: str | SentenceTransformer | CrossEncoder, revision: str | None = None, model_prompts: dict[str, str] | None = None, - similarity_fn_name: str | None = None, **kwargs, ) -> None: """Wrapper for SentenceTransformer models. @@ -33,7 +32,6 @@ def __init__( First priority is given to the composed prompt of task name + prompt type (query or passage), then to the specific task prompt, then to the composed prompt of task type + prompt type, then to the specific task type prompt, and finally to the specific prompt type. - similarity_fn_name: A similarity function to use. **kwargs: Additional arguments to pass to the SentenceTransformer model. """ if isinstance(model, str): @@ -61,9 +59,7 @@ def __init__( if isinstance(self.model, CrossEncoder): self.predict = self.handle_instructions_predict - if similarity_fn_name: - self.similarity = self.get_similarity_function(similarity_fn_name) - elif hasattr(self.model, "similarity") and callable(self.model.similarity): + if hasattr(self.model, "similarity") and callable(self.model.similarity): self.similarity = self.model.similarity def encode( diff --git a/mteb/models/wrapper.py b/mteb/models/wrapper.py index 76b31ba52..956071d3d 100644 --- a/mteb/models/wrapper.py +++ b/mteb/models/wrapper.py @@ -3,12 +3,9 @@ import logging from typing import Callable, get_args -import numpy as np - import mteb from mteb.abstasks.TaskMetadata import TASK_TYPE from mteb.encoder_interface import PromptType -from mteb.evaluation.evaluators.utils import cos_sim, dot_score logger = logging.getLogger(__name__) @@ -67,18 +64,6 @@ def get_prompt_name( ) return None - @staticmethod - def get_similarity_function( - similarity_fn_name: str, - ) -> Callable[[np.ndarray, np.ndarray], np.ndarray]: - if similarity_fn_name == "cosine": - return cos_sim - if similarity_fn_name == "dot": - return dot_score - raise ValueError( - "Invalid similarity function. 
Should be one of ['cosine', 'dot']" - ) - @staticmethod def validate_task_to_prompt_name( task_to_prompt_name: dict[str, str] | None, diff --git a/tests/test_benchmark/test_models.py b/tests/test_benchmark/test_models.py new file mode 100644 index 000000000..ee5bed091 --- /dev/null +++ b/tests/test_benchmark/test_models.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +import pytest + +import mteb +from mteb import MTEB +from mteb.abstasks import AbsTask + +from .mock_tasks import MockRetrievalTask + + +@pytest.mark.parametrize("model", ["colbert-ir/colbertv2.0"]) +@pytest.mark.parametrize("task", [MockRetrievalTask()]) +def test_colbert_model_e2e(task: AbsTask, model: str): + pytest.importorskip("pylate", reason="pylate not installed") + eval_splits = ["test"] + model = mteb.get_model(model) + evaluation = MTEB(tasks=[task]) + + results = evaluation.run( + model, + eval_splits=eval_splits, + corpus_chunk_size=500, + ) + result = results[0] + + assert result.scores["test"][0]["ndcg_at_1"] == 1.0 + + +def test_bm25s_e2e(): + # fails for dataset smaller then 1000 + pytest.importorskip("bm25s", reason="bm25s not installed") + pytest.importorskip("Stemmer", reason="PyStemmer not installed") + + model = mteb.get_model("bm25s") + tasks = mteb.get_tasks(tasks=["NFCorpus"]) + eval_splits = ["test"] + + evaluation = MTEB(tasks=tasks) + + results = evaluation.run(model, eval_splits=eval_splits) + result = results[0] + + assert result.scores["test"][0]["ndcg_at_1"] == 0.42879 From 74b495cd197846af91d6425891d1f9156cd1db68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 17 Jan 2025 14:46:03 +0100 Subject: [PATCH 23/49] fix: Added Chinese Stella models (#1824) Added Chinese Stella models --- mteb/models/stella_models.py | 109 +++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index a738f4461..c7f9aad9f 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -53,3 +53,112 @@ framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_1.5B_v5", ) + +stella_large_zh_v3_1792d = ModelMeta( + name="dunzhang/stella-large-zh-v3-1792d", + languages=["zho_Hans"], + open_weights=True, + revision="d5d39eb8cd11c80a63df53314e59997074469f09", + release_date="2024-02-17", + n_parameters=None, # can't see on model card + memory_usage=None, + embed_dim=1792, + license="not specified", + max_tokens=512, + reference="https://huggingface.co/dunzhang/stella-large-zh-v3-1792d", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by="dunzhang/stella-mrl-large-zh-v3.5-1792d", + adapted_from=None, + public_training_code=False, + public_training_data=True, + training_datasets={ + # Not in MTEB: + # - infgrad/dialogue_rewrite_llm + # - infgrad/retrieval_data_llm + }, +) + +stella_base_zh_v3_1792d = ModelMeta( + name="infgrad/stella-base-zh-v3-1792d", + languages=["zho_Hans"], + open_weights=True, + revision="82254892a0fba125aa2abf3a4800d2dd12821343", + release_date="2024-02-17", + n_parameters=None, # can't see on model card + memory_usage=None, + embed_dim=1792, + license="mit", + max_tokens=512, + reference="https://huggingface.co/infgrad/stella-base-zh-v3-1792d", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=False, + 
public_training_data=True, + training_datasets={ + # Not in MTEB: + # - infgrad/dialogue_rewrite_llm + # - infgrad/retrieval_data_llm + }, +) + + +stella_mrl_large_zh_v3_5_1792d = ModelMeta( + name="dunzhang/stella-mrl-large-zh-v3.5-1792d", + languages=["zho_Hans"], + open_weights=True, + revision="17bb1c32a93a8fc5f6fc9e91d5ea86da99983cfe", + release_date="2024-02-27", + n_parameters=326 * 1e6, + memory_usage=None, + embed_dim=1792, + license="mit", + max_tokens=512, + reference="https://huggingface.co/dunzhang/stella-large-zh-v3-1792d", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from="dunzhang/stella-large-zh-v3-1792d", + public_training_code=False, + public_training_data=True, + training_datasets=None, # Not specified +) + +zpoint_large_embedding_zh = ModelMeta( + name="iampanda/zpoint_large_embedding_zh", + languages=["zho_Hans"], + open_weights=True, + revision="b1075144f440ab4409c05622c1179130ebd57d03", + release_date="2024-06-04", + n_parameters=326 * 1e6, + memory_usage=None, + embed_dim=1792, + license="mit", + max_tokens=512, + reference="https://huggingface.co/iampanda/zpoint_large_embedding_zh", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from="dunzhang/stella-mrl-large-zh-v3.5-1792d", + public_training_code=False, + public_training_data=True, + training_datasets={ + # It's a bit unclear what they have trained on to be honest, because they don't list all + # And they also have some rather cryptic description of their training procedure, but at + # Least they disclose that they have trained on these: + "MIRACLRetrieval": ["train"], + "MIRACLReranking": ["train"], + "DuRetrieval": ["train"], + "T2Retrieval": ["train"], + "MultiLongDocRetrieval": ["train"], + # Not in MTEB: + # - Shitao/bge-reranker-data + # - FreedomIntelligence/Huatuo26M-Lite + }, +) From 96420a2ad39a61aafb34630f5c6c5a50a3717fdc Mon Sep 17 00:00:00 2001 From: Sam <40773225+sam-hey@users.noreply.github.com> Date: Fri, 17 Jan 2025 14:46:56 +0100 Subject: [PATCH 24/49] fix: bm25s (#1827) Co-authored-by: sam021313 <40773225+sam021313@users.noreply.github.com> --- mteb/evaluation/evaluators/RetrievalEvaluator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mteb/evaluation/evaluators/RetrievalEvaluator.py b/mteb/evaluation/evaluators/RetrievalEvaluator.py index 251498d6b..77b8ecc0a 100644 --- a/mteb/evaluation/evaluators/RetrievalEvaluator.py +++ b/mteb/evaluation/evaluators/RetrievalEvaluator.py @@ -477,6 +477,7 @@ def __call__( corpus, queries, self.top_k, + score_function="bm25", task_name=self.task_name, # type: ignore ) else: From 3b2d074efbe9d665171071dab63796f3ae783802 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 17 Jan 2025 14:47:22 +0100 Subject: [PATCH 25/49] fix: Added way more training dataset annotations (#1765) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: Leaderboard: `K` instead of `M` Fixes #1752 * format * fixed existing annotations to refer to task name instead of hf dataset * added annotation to nvidia * added voyage * added uae annotations * Added stella annotations * sentence trf models * added salesforce and e5 * jina * bge + model2vec * added llm2vec annotations * add jasper * format * format * Updated annotations and moved jina models * fix: add even more training dataset annotations (#1793) * fix: update max tokens for OpenAI (#1772) 
update max tokens * ci: skip AfriSentiLID for now (#1785) * skip AfriSentiLID for now * skip relevant test case instead --------- Co-authored-by: Isaac Chung * 1.28.7 Automatically generated by python-semantic-release * ci: fix model loading test (#1775) * pass base branch into the make command as an arg * test a file that has custom wrapper * what about overview * just dont check overview * revert instance check * explicitly omit overview and init * remove test change * try on a lot of models * revert test model file --------- Co-authored-by: Isaac Chung * feat: Update task filtering, fixing bug which included cross-lingual tasks in overly many benchmarks (#1787) * feat: Update task filtering, fixing bug on MTEB - Updated task filtering adding exclusive_language_filter and hf_subset - fix bug in MTEB where cross-lingual splits were included - added missing language filtering to MTEB(europe, beta) and MTEB(indic, beta) The following code outlines the problems: ```py import mteb from mteb.benchmarks import MTEB_ENG_CLASSIC task = [t for t in MTEB_ENG_CLASSIC.tasks if t.metadata.name == "STS22"][0] # was eq. to: task = mteb.get_task("STS22", languages=["eng"]) task.hf_subsets # correct filtering to English datasets: # ['en', 'de-en', 'es-en', 'pl-en', 'zh-en'] # However it should be: # ['en'] # with the changes it is: task = [t for t in MTEB_ENG_CLASSIC.tasks if t.metadata.name == "STS22"][0] task.hf_subsets # ['en'] # eq. to task = mteb.get_task("STS22", hf_subsets=["en"]) # which you can also obtain using the exclusive_language_filter (though not if there was multiple english splits): task = mteb.get_task("STS22", languages=["eng"], exclusive_language_filter=True) ``` * format * remove "en-ext" from AmazonCounterfactualClassification * fixed mteb(deu) * fix: simplify in a few areas * fix: Add gritlm * 1.29.0 Automatically generated by python-semantic-release * fix: Added more annotations! 
* fix: Added C-MTEB (#1786) Added C-MTEB * 1.29.1 Automatically generated by python-semantic-release * docs: Add contact to MMTEB benchmarks (#1796) * Add myself to MMTEB benchmarks * lint * fix: loading pre 11 (#1798) * fix loading pre 11 * add similarity * lint * run all task types * 1.29.2 Automatically generated by python-semantic-release * fix: allow to load no revision available (#1801) * fix allow to load no revision available * lint * add require_model_meta to leaderboard * lint * 1.29.3 Automatically generated by python-semantic-release --------- Co-authored-by: Roman Solomatin Co-authored-by: Isaac Chung Co-authored-by: Isaac Chung Co-authored-by: github-actions Co-authored-by: Márton Kardos --------- Co-authored-by: Roman Solomatin Co-authored-by: Isaac Chung Co-authored-by: Isaac Chung Co-authored-by: github-actions Co-authored-by: Márton Kardos --- mteb/models/bge_models.py | 144 +++---- mteb/models/e5_instruct.py | 8 +- mteb/models/e5_models.py | 148 ++------ mteb/models/gritlm_models.py | 11 +- mteb/models/jasper_models.py | 8 +- mteb/models/jina_models.py | 94 ++++- mteb/models/llm2vec_models.py | 49 +++ mteb/models/misc_models.py | 152 +++++--- mteb/models/model2vec_models.py | 108 ++---- mteb/models/nvidia_models.py | 54 +++ mteb/models/ru_sentence_models.py | 43 ++- mteb/models/salesforce_models.py | 25 ++ mteb/models/sentence_transformers_models.py | 395 +++++--------------- mteb/models/stella_models.py | 6 + mteb/models/uae_models.py | 9 + mteb/models/voyage_models.py | 27 ++ 16 files changed, 609 insertions(+), 672 deletions(-) diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index 56efff84d..05547d6a0 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -298,6 +298,60 @@ "zho_Hans", # zh ] +bge_m_training_data = { + # source: https://arxiv.org/pdf/2402.03216 + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + "LeCaRDv2": ["train"], + "CMedQAv1-reranking": ["train"], + "CMedQAv2-reranking": ["train"], + "MrTidyRetrieval": ["train"], + "T2Reranking": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + # + synthetic data +} + +bge_training_data = { + # source: https://data.baai.ac.cn/details/BAAI-MTP + "NQ": ["test"], + "NQHardNegatives": ["test"], + "AmazonReviewsClassification": [ + "validation", + "test", + ], # assumed from: amazon_reviews_multi + "MLQARetrieval": [ + "validation", + "test", + ], # assumed from mlqa (question, context) + # not in mteb + # Dataset Pairs + # wudao (title, passage) + # cmrc2018 (query, context) + # dureader (query, context) + # simclue (sentence_a, sentence_b) + # csl (title, abstract) + # amazon_reviews_multi (title, body) + # wiki_atomic_edits (base_sentence, edited_sentence) + # mlqa (question, context) + # xlsum (title, summary) (title, text) + # "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further + # "wikipedia": [], # title + section title, passage + # "reddit": [], # title, body + # "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer) + # "s2orc": [], # (title, 
abstract) (title, citation title) (abstract, citation abstract) +} + bge_small_en_v1_5 = ModelMeta( loader=partial( # type: ignore sentence_transformers_loader, @@ -321,35 +375,7 @@ use_instructions=True, public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP public_training_code=None, # seemingly released (at least for some models, but the link is broken - training_datasets={ - # source: https://data.baai.ac.cn/details/BAAI-MTP - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], # assumed from: amazon_reviews_multi - "MLQARetrieval": [ - "validation", - "test", - ], # assumed from mlqa (question, context) - # not in mteb - # Dataset Pairs - # wudao (title, passage) - # cmrc2018 (query, context) - # dureader (query, context) - # simclue (sentence_a, sentence_b) - # csl (title, abstract) - # amazon_reviews_multi (title, body) - # wiki_atomic_edits (base_sentence, edited_sentence) - # mlqa (question, context) - # xlsum (title, summary) (title, text) - # "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further - # "wikipedia": [], # title + section title, passage - # "reddit": [], # title, body - # "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer) - # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) - }, + training_datasets=bge_training_data, ) bge_base_en_v1_5 = ModelMeta( @@ -375,35 +401,7 @@ use_instructions=True, public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP public_training_code=None, # seemingly released (at least for some models, but the link is broken - training_datasets={ - # source: https://data.baai.ac.cn/details/BAAI-MTP - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], # assumed from: amazon_reviews_multi - "MLQARetrieval": [ - "validation", - "test", - ], # assumed from mlqa (question, context) - # not in mteb - # Dataset Pairs - # wudao (title, passage) - # cmrc2018 (query, context) - # dureader (query, context) - # simclue (sentence_a, sentence_b) - # csl (title, abstract) - # amazon_reviews_multi (title, body) - # wiki_atomic_edits (base_sentence, edited_sentence) - # mlqa (question, context) - # xlsum (title, summary) (title, text) - # "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further - # "wikipedia": [], # title + section title, passage - # "reddit": [], # title, body - # "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer) - # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) - }, + training_datasets=bge_training_data, ) bge_large_en_v1_5 = ModelMeta( @@ -429,35 +427,7 @@ use_instructions=True, public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP public_training_code=None, # seemingly released (at least for some models, but the link is broken - training_datasets={ - # source: https://data.baai.ac.cn/details/BAAI-MTP - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], # assumed from: amazon_reviews_multi - "MLQARetrieval": [ - "validation", - "test", - ], # assumed from mlqa (question, context) - # not in mteb - # Dataset Pairs - # wudao (title, passage) - # cmrc2018 (query, context) - # dureader (query, context) - # simclue (sentence_a, 
sentence_b) - # csl (title, abstract) - # amazon_reviews_multi (title, body) - # wiki_atomic_edits (base_sentence, edited_sentence) - # mlqa (question, context) - # xlsum (title, summary) (title, text) - # "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further - # "wikipedia": [], # title + section title, passage - # "reddit": [], # title, body - # "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer) - # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) - }, + training_datasets=bge_training_data, ) bge_small_zh_v1_5 = ModelMeta( diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py index f26d78ed6..182a6ea4b 100644 --- a/mteb/models/e5_instruct.py +++ b/mteb/models/e5_instruct.py @@ -6,7 +6,7 @@ from mteb.model_meta import ModelMeta -from .e5_models import E5_PAPER_RELEASE_DATE, XLMR_LANGUAGES +from .e5_models import E5_PAPER_RELEASE_DATE, E5_TRAINING_DATA, XLMR_LANGUAGES from .instruct_wrapper import instruct_wrapper MISTRAL_LANGUAGES = ["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"] @@ -40,6 +40,9 @@ embed_dim=1024, license="mit", max_tokens=514, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_mistral = ModelMeta( @@ -69,4 +72,7 @@ embed_dim=4096, license="mit", max_tokens=32768, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py index 97b117002..9537824e5 100644 --- a/mteb/models/e5_models.py +++ b/mteb/models/e5_models.py @@ -113,6 +113,19 @@ PromptType.passage.value: "passage: ", } +E5_TRAINING_DATA = { + # from 4.2 in https://arxiv.org/pdf/2212.03533 + # also pre-training data from a variety of sources (stackexchange, semantic scholar, reddit, CC, ...) + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on +} + e5_mult_small = ModelMeta( loader=partial( # type: ignore sentence_transformers_loader, @@ -134,26 +147,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2212.03533 - # table 1: - # Wikipedia 150M - # mC4 160M - # Multilingual CC News 160M - # NLLB 160M - # Reddit 160M - # S2ORC 50M - # Stackexchange 50M - # xP3 80M - # Misc. SBERT Data 10M - # ---- - # from Misc. SBERT Data 10M: - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_mult_base = ModelMeta( @@ -176,26 +172,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2402.05672 - # table 1: - # Wikipedia 150M - # mC4 160M - # Multilingual CC News 160M - # NLLB 160M - # Reddit 160M - # S2ORC 50M - # Stackexchange 50M - # xP3 80M - # Misc. SBERT Data 10M - # ---- - # from Misc. 
SBERT Data 10M: - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_mult_large = ModelMeta( @@ -219,26 +198,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2402.05672 - # table 1: - # Wikipedia 150M - # mC4 160M - # Multilingual CC News 160M - # NLLB 160M - # Reddit 160M - # S2ORC 50M - # Stackexchange 50M - # xP3 80M - # Misc. SBERT Data 10M - # ---- - # from Misc. SBERT Data 10M: - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_eng_small_v2 = ModelMeta( @@ -261,14 +223,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2212.03533 - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_eng_small = ModelMeta( @@ -292,14 +249,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2212.03533 - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_eng_base_v2 = ModelMeta( @@ -325,14 +277,9 @@ use_instructions=True, superseded_by=None, adapted_from=None, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2212.03533 - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_eng_large_v2 = ModelMeta( @@ -358,14 +305,9 @@ use_instructions=True, superseded_by=None, adapted_from=None, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2212.03533 - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_large = ModelMeta( @@ -391,14 +333,9 @@ use_instructions=True, superseded_by="intfloat/e5-large-v2", adapted_from=None, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2212.03533 - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? 
- }, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) e5_base = ModelMeta( @@ -424,12 +361,7 @@ use_instructions=True, superseded_by="intfloat/e5-base-v2", adapted_from=None, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find - training_datasets={ - # source: https://arxiv.org/pdf/2212.03533 - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], # dev? - }, + public_training_data=False, + public_training_code=False, + training_datasets=E5_TRAINING_DATA, ) diff --git a/mteb/models/gritlm_models.py b/mteb/models/gritlm_models.py index 91acafa26..a4f5befd1 100644 --- a/mteb/models/gritlm_models.py +++ b/mteb/models/gritlm_models.py @@ -5,6 +5,7 @@ from mteb.model_meta import ModelMeta +from .e5_models import E5_TRAINING_DATA from .instruct_wrapper import instruct_wrapper logger = logging.getLogger(__name__) @@ -29,7 +30,6 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: open_weights=True, revision="13f00a0e36500c80ce12870ea513846a066004af", release_date="2024-02-15", - training_datasets={"GritLM/tulu2": ["train"]}, n_parameters=7_240_000_000, memory_usage=None, embed_dim=4096, @@ -39,6 +39,10 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: similarity_fn_name="cosine", framework=["GritLM", "PyTorch"], use_instructions=True, + training_datasets=E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 + # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data + public_training_code=True, # https://github.com/ContextualAI/gritlm + public_training_data=False, ) gritlm8x7b = ModelMeta( loader=partial( # type: ignore @@ -50,7 +54,6 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: ), name="GritLM/GritLM-8x7B", languages=["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"], - training_datasets={"GritLM/tulu2": ["train"]}, open_weights=True, revision="7f089b13e3345510281733ca1e6ff871b5b4bc76", release_date="2024-02-15", @@ -63,4 +66,8 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: similarity_fn_name="cosine", framework=["GritLM", "PyTorch"], use_instructions=True, + training_datasets=E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 + # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data + public_training_code=True, # https://github.com/ContextualAI/gritlm + public_training_data=False, ) diff --git a/mteb/models/jasper_models.py b/mteb/models/jasper_models.py index 60fa4f697..0062df2ac 100644 --- a/mteb/models/jasper_models.py +++ b/mteb/models/jasper_models.py @@ -13,6 +13,7 @@ from mteb.encoder_interface import PromptType from mteb.model_meta import ModelMeta +from .nvidia_models import nvidia_training_datasets from .wrapper import Wrapper logger = logging.getLogger(__name__) @@ -90,7 +91,8 @@ def encode( use_instructions=True, adapted_from=None, superseded_by=None, - training_datasets={ - "non_mteb": ["BAAI/Infinity-MM", "HuggingFaceFW/fineweb-edu"], - }, + training_datasets=nvidia_training_datasets, # "In jasper model the teacher model is nvidia/NV-Embed-v2", source https://huggingface.co/infgrad/jasper_en_vision_language_v1 + # "non_mteb": ["BAAI/Infinity-MM", "HuggingFaceFW/fineweb-edu"], + public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/jina_models.py 
b/mteb/models/jina_models.py index 122f19065..728ffaa98 100644 --- a/mteb/models/jina_models.py +++ b/mteb/models/jina_models.py @@ -214,7 +214,7 @@ def encode( open_weights=True, revision="215a6e121fa0183376388ac6b1ae230326bfeaed", release_date="2024-09-18", # official release date - n_parameters=572 * 1e6, + n_parameters=int(572 * 1e6), max_tokens=8194, embed_dim=4096, license="cc-by-nc-4.0", @@ -222,4 +222,96 @@ def encode( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, reference="https://huggingface.co/jinaai/jina-embeddings-v3", + training_datasets=None, + public_training_code=False, + public_training_data=False, +) + + +jina_embeddings_v2_base_en = ModelMeta( + name="jinaai/jina-embeddings-v2-base-en", + languages=["eng-Latn"], + open_weights=True, + revision="6e85f575bc273f1fd840a658067d0157933c83f0", + release_date="2023-09-27", + n_parameters=137_000_000, + memory_usage=None, + embed_dim=768, + license="apache-2.0", + max_tokens=8192, + reference="https://huggingface.co/jinaai/jina-embeddings-v2-base-en", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. CC +) + +jina_embeddings_v2_small_en = ModelMeta( + name="jinaai/jina-embeddings-v2-small-en", + languages=["eng-Latn"], + open_weights=True, + revision="796cff318cdd4e5fbe8b7303a1ef8cbec36996ef", + release_date="2023-09-27", + n_parameters=32_700_000, + memory_usage=None, + embed_dim=512, + license="apache-2.0", + max_tokens=8192, + reference="https://huggingface.co/jinaai/jina-embeddings-v2-small-en", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. CC and {"jinaai/negation-dataset": ["train"]} +) + +jina_embedding_b_en_v1 = ModelMeta( + name="jinaai/jina-embedding-b-en-v1", + languages=["eng-Latn"], + open_weights=True, + revision="aa0645035294a8c0607ce5bb700aba982cdff32c", + release_date="2023-07-07", + n_parameters=110_000_000, + memory_usage=None, + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/jinaai/jina-embedding-b-en-v1", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by="jinaai/jina-embeddings-v2-base-en", + adapted_from=None, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. CC and {"jinaai/negation-dataset": ["train"]} +) + +jina_embedding_s_en_v1 = ModelMeta( + name="jinaai/jina-embedding-s-en-v1", + languages=["eng-Latn"], + open_weights=True, + revision="c1fed70aa4823a640f1a7150a276e4d3b08dce08", + release_date="2023-07-07", + n_parameters=35_000_000, + memory_usage=None, + embed_dim=512, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/jinaai/jina-embedding-s-en-v1", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by="jinaai/jina-embeddings-v2-small-en", + adapted_from=None, + training_datasets=None, + public_training_code=False, + public_training_data=False, # uses scrapes e.g. 
CC and {"jinaai/negation-dataset": ["train"]} ) diff --git a/mteb/models/llm2vec_models.py b/mteb/models/llm2vec_models.py index e962289aa..cbc42fe5e 100644 --- a/mteb/models/llm2vec_models.py +++ b/mteb/models/llm2vec_models.py @@ -20,6 +20,31 @@ def llm2vec_instruction(instruction): return instruction +llm2vec_supervised_training_data = { + # source, section g1: https://arxiv.org/pdf/2404.05961 + # splits assumed but unkown + "HotpotQA": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "NanoFEVERRetrieval": ["train"], + "MrTidyRetrieval": ["train"], + "T2Reranking": ["train"], +} + + class LLM2VecWrapper(Wrapper): def __init__( self, @@ -100,6 +125,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets=llm2vec_supervised_training_data, ) llm2vec_llama3_8b_unsupervised = ModelMeta( @@ -124,6 +152,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets={}, ) @@ -149,6 +180,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets=llm2vec_supervised_training_data, ) llm2vec_mistral7b_unsupervised = ModelMeta( @@ -173,6 +207,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets={}, ) llm2vec_llama2_7b_supervised = ModelMeta( @@ -197,6 +234,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets=llm2vec_supervised_training_data, ) llm2vec_llama2_7b_unsupervised = ModelMeta( @@ -221,6 +261,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets={}, ) llm2vec_sheared_llama_supervised = ModelMeta( @@ -245,6 +288,9 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets=llm2vec_supervised_training_data, ) llm2vec_sheared_llama_unsupervised = ModelMeta( @@ -269,4 +315,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, + public_training_code=True, + public_training_data=True, + training_datasets={}, ) diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index 5e8fcae0a..88dad0050 100644 --- a/mteb/models/misc_models.py 
+++ b/mteb/models/misc_models.py @@ -5,6 +5,10 @@ import torch from mteb.model_meta import ModelMeta, sentence_transformers_loader +from mteb.models.e5_models import E5_TRAINING_DATA + +from .bge_models import bge_m_training_data, bge_training_data +from .sentence_transformers_models import sent_trf_training_dataset Haon_Chen__speed_embedding_7b_instruct = ModelMeta( name="Haon-Chen/speed-embedding-7b-instruct", @@ -113,38 +117,47 @@ similarity_fn_name="cosine", use_instructions=None, training_datasets={ - "s2orc": ["train"], - "flax-sentence-embeddings/stackexchange_title_body_jsonl": ["train"], - "flax-sentence-embeddings/stackexchange_titlebody_best_voted_answer_jsonl": [ - "train" - ], - "flax-sentence-embeddings/stackexchange_title_best_voted_answer_jsonl": [ - "train" - ], - "flax-sentence-embeddings/stackexchange_titlebody_best_and_down_voted_answer_jsonl": [ - "train" - ], - "sentence-transformers/reddit-title-body": ["train"], - "msmarco": ["train"], - "gooaq": ["train"], - "yahoo_answers_topics": ["train"], - "code_search_net": ["train"], - "search_qa": ["train"], - "eli5": ["train"], - "snli": ["train"], - "multi_nli": ["train"], - "wikihow": ["train"], - "natural_questions": ["train"], - "trivia_qa": ["train"], - "embedding-data/sentence-compression": ["train"], - "embedding-data/flickr30k-captions": ["train"], - "embedding-data/altlex": ["train"], - "embedding-data/simple-wiki": ["train"], - "embedding-data/QQP": ["train"], - "embedding-data/SPECTER": ["train"], - "embedding-data/PAQ_pairs": ["train"], - "embedding-data/WikiAnswers": ["train"], - "sentence-transformers/embedding-training-data": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB + # "s2orc": ["train"], + # "flax-sentence-embeddings/stackexchange_title_body_jsonl": ["train"], + # "flax-sentence-embeddings/stackexchange_titlebody_best_voted_answer_jsonl": [ + # "train" + # ], + # "flax-sentence-embeddings/stackexchange_title_best_voted_answer_jsonl": [ + # "train" + # ], + # "flax-sentence-embeddings/stackexchange_titlebody_best_and_down_voted_answer_jsonl": [ + # "train" + # ], + # "sentence-transformers/reddit-title-body": ["train"], + # "msmarco": ["train"], + # "gooaq": ["train"], + # "yahoo_answers_topics": ["train"], + # "code_search_net": ["train"], + # "search_qa": ["train"], + # "eli5": ["train"], + # "snli": ["train"], + # "multi_nli": ["train"], + # "wikihow": ["train"], + # "natural_questions": ["train"], + # "trivia_qa": ["train"], + # "embedding-data/sentence-compression": ["train"], + # "embedding-data/flickr30k-captions": ["train"], + # "embedding-data/altlex": ["train"], + # "embedding-data/simple-wiki": ["train"], + # "embedding-data/QQP": ["train"], + # "embedding-data/SPECTER": ["train"], + # "embedding-data/PAQ_pairs": ["train"], + # "embedding-data/WikiAnswers": ["train"], + # "sentence-transformers/embedding-training-data": ["train"], }, adapted_from="hum-lodestone-v1", superseded_by=None, @@ -189,7 +202,8 @@ reference="https://huggingface.co/BeastyZ/e5-R-mistral-7b", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"BeastyZ/E5-R": ["train"]}, + training_datasets=E5_TRAINING_DATA, + # not MTEB: {"BeastyZ/E5-R": ["train"]}, adapted_from="/ConRetriever/public_weight_mistral", superseded_by=None, ) @@ -286,13 
+300,14 @@ embed_dim=384, license="mit", open_weights=True, - public_training_data=True, + public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Bulbasaur", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=None, # source model is GTE-tiny where training data is unknown + # {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/dwsdwass", superseded_by=None, ) @@ -308,13 +323,14 @@ embed_dim=384, license="mit", open_weights=True, - public_training_data=True, + public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Ivysaur", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=None, # source model is GTE-tiny where training data is unknown + # not MTEB: {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/jhjghjgh", superseded_by=None, ) @@ -336,7 +352,8 @@ reference="https://huggingface.co/Mihaiii/Squirtle", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=bge_training_data, # source model is bge-base-en-v1.5 + # not MTEB: {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/test21", superseded_by=None, ) @@ -358,7 +375,8 @@ reference="https://huggingface.co/Mihaiii/Venusaur", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=None, # source model is unknown + # {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/test14", superseded_by=None, ) @@ -380,7 +398,8 @@ reference="https://huggingface.co/Mihaiii/Wartortle", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Mihaiii/qa-assistant": ["train"]}, + training_datasets=bge_training_data, # distilled from bge-base-en-v1.5 + # {"Mihaiii/qa-assistant": ["train"]}, adapted_from="Mihaiii/test22", superseded_by=None, ) @@ -468,7 +487,7 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets={}, # not in MTEB: {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="aubmindlab/bert-base-arabertv02", superseded_by=None, ) @@ -490,7 +509,9 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=sent_trf_training_dataset, + # not in MTEB + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", superseded_by=None, ) @@ -512,7 +533,9 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=sent_trf_training_dataset, # derived from + # not in MTEB: + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
superseded_by=None, ) @@ -534,7 +557,9 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-labse-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=None, # derived from labSE + # as well as: + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="sentence-transformers/LaBSE", superseded_by=None, ) @@ -556,7 +581,9 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets=sent_trf_training_dataset, + # not in MTEB: + # {"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="tomaarsen/mpnet-base-all-nli-triplet", superseded_by=None, ) @@ -578,7 +605,7 @@ reference="https://huggingface.co/Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, + training_datasets={}, # not in MTEB: "Omartificial-Intelligence-Space/Arabic-NLi-Triplet": ["train"]}, adapted_from="UBC-NLP/MARBERTv2", superseded_by=None, ) @@ -710,7 +737,8 @@ reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.4", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"manu/embedding_data_v2_100k": ["train"]}, + training_datasets=None, + # Not in MTEB: {"manu/embedding_data_v2_100k": ["train"]}, adapted_from="croissantllm/CroissantCool-v0.2", superseded_by=None, ) @@ -1356,7 +1384,8 @@ reference="https://huggingface.co/aari1995/German_Semantic_STS_V2", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"stsb_multi_mt": ["train"]}, + training_datasets=None, # couldn't figure out the source model + # {"stsb_multi_mt": ["train"]}, adapted_from="/content/drive/MyDrive/Stanford_NLU/Project/false_friends/gbert_large_sts_only", superseded_by=None, ) @@ -1472,18 +1501,18 @@ reference="https://huggingface.co/deepvk/USER-bge-m3", similarity_fn_name="cosine", use_instructions=None, - training_datasets={ - "deepvk/ru-HNP": ["train"], - "deepvk/ru-WANLI": ["train"], - "Shitao/bge-m3-data": ["train"], - "RussianNLP/russian_super_glue": ["train"], - "reciTAL/mlsum": ["train"], - "Milana/russian_keywords": ["train"], - "IlyaGusev/gazeta": ["train"], - "d0rj/gsm8k-ru": ["train"], - "bragovo/dsum_ru": ["train"], - "CarlBrendt/Summ_Dialog_News": ["train"], - }, + training_datasets=bge_m_training_data, # derived from. 
+ # not in MTEB: + # "deepvk/ru-HNP": ["train"], + # "deepvk/ru-WANLI": ["train"], + # "Shitao/bge-m3-data": ["train"], + # "RussianNLP/russian_super_glue": ["train"], + # "reciTAL/mlsum": ["train"], + # "Milana/russian_keywords": ["train"], + # "IlyaGusev/gazeta": ["train"], + # "d0rj/gsm8k-ru": ["train"], + # "bragovo/dsum_ru": ["train"], + # "CarlBrendt/Summ_Dialog_News": ["train"], adapted_from="USER-bge-m3", superseded_by=None, ) @@ -1613,7 +1642,8 @@ reference="https://huggingface.co/shibing624/text2vec-base-multilingual", similarity_fn_name="cosine", use_instructions=None, - training_datasets={"shibing624/nli-zh-all": ["train"]}, + training_datasets=sent_trf_training_dataset, + # not MTEB: {"shibing624/nli-zh-all": ["train"]}, adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", superseded_by=None, ) diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index 37da53345..1a58bbf8e 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -9,6 +9,7 @@ from mteb.model_meta import ModelMeta +from .bge_models import bge_training_data from .wrapper import Wrapper logger = logging.getLogger(__name__) @@ -72,21 +73,10 @@ def encode( reference="https://huggingface.co/minishlab/M2V_base_glove_subword", use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", - public_training_data=True, - public_training_code=None, # distilled model - training_datasets={ # same as adapted from - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], - "MLQARetrieval": [ - "validation", - "test", - ], - }, superseded_by=None, + training_datasets=bge_training_data, # distilled + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) @@ -110,20 +100,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - public_training_data=True, - public_training_code=None, # distilled model - training_datasets={ # same as adapted from - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], - "MLQARetrieval": [ - "validation", - "test", - ], - }, + training_datasets=bge_training_data, # distilled + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) m2v_base_output = ModelMeta( @@ -146,20 +125,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - public_training_data=True, - public_training_code=None, # distilled model - training_datasets={ # same as adapted from - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], - "MLQARetrieval": [ - "validation", - "test", - ], - }, + training_datasets=bge_training_data, # distilled + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) m2v_multilingual_output = ModelMeta( @@ -182,8 +150,9 @@ def encode( use_instructions=False, adapted_from="sentence-transformers/LaBSE", superseded_by=None, - public_training_data=True, - public_training_code=None, # distilled model + training_datasets=None, + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) potion_base_2m = ModelMeta( @@ -206,20 +175,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - public_training_data=True, - public_training_code=None, # distilled model 
- training_datasets={ # same as adapted from - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], - "MLQARetrieval": [ - "validation", - "test", - ], - }, + training_datasets=bge_training_data, # distilled + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) potion_base_4m = ModelMeta( @@ -242,20 +200,9 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - public_training_data=True, - public_training_code=None, # distilled model - training_datasets={ # same as adapted from - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], - "MLQARetrieval": [ - "validation", - "test", - ], - }, + training_datasets=bge_training_data, # distilled + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) potion_base_8m = ModelMeta( @@ -278,18 +225,7 @@ def encode( use_instructions=False, adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, - public_training_data=True, - public_training_code=None, # distilled model - training_datasets={ # same as adapted from - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], - "MLQARetrieval": [ - "validation", - "test", - ], - }, + training_datasets=bge_training_data, # distilled + public_training_code=True, # https://github.com/MinishLab/model2vec + public_training_data=False, ) diff --git a/mteb/models/nvidia_models.py b/mteb/models/nvidia_models.py index 72274b41d..6bf4e041a 100644 --- a/mteb/models/nvidia_models.py +++ b/mteb/models/nvidia_models.py @@ -72,6 +72,54 @@ def encode( return embeddings +nvidia_training_datasets = { + # source: https://arxiv.org/pdf/2405.17428 + "ArguAna": ["train"], + "ArguAna-PL": ["train"], + "NanoArguAnaRetrieval": ["train"], + "HotpotQA": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "NanoFEVERRetrieval": ["train"], + "FiQA2018": ["train"], + "FiQA2018-PL": ["train"], # translation not trained on + "STS12": ["train"], + "STS22": ["train"], + "AmazonReviewsClassification": ["train"], + "AmazonCounterfactualClassification": ["train"], + "Banking77Classification": ["train"], + "EmotionClassification": ["train"], + "ImdbClassification": ["train"], + "MTOPIntentClassification": ["train"], + "ToxicConversationsClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "ArxivClusteringP2P": ["train"], + "ArxivClusteringP2P.v2": ["train"], + "ArxivClusteringS2S": ["train"], + "ArxivClusteringS2S.v2": ["train"], + "BiorxivClusteringP2P": ["train"], + "BiorxivClusteringP2P.v2": ["train"], + "BiorxivClusteringS2S": ["train"], + "BiorxivClusteringS2S.v2": ["train"], + "MedrxivClusteringP2P": ["train"], + "MedrxivClusteringP2P.v2": ["train"], + "MedrxivClusteringS2S": ["train"], + "MedrxivClusteringS2S.v2": ["train"], + "TwentyNewsgroupsClustering": ["train"], + "TwentyNewsgroupsClustering.v2": ["train"], + "STSBenchmark": ["train"], + "STSBenchmarkMultilingualSTS": ["train"], # translated, not 
trained on +} NV_embed_v2 = ModelMeta( loader=partial( # type: ignore NvEmbedWrapper, @@ -92,6 +140,9 @@ def encode( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + training_datasets=nvidia_training_datasets, + public_training_code=None, + public_training_data=True, ) NV_embed_v1 = ModelMeta( @@ -114,4 +165,7 @@ def encode( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + training_datasets=nvidia_training_datasets, + public_training_code=None, + public_training_data=True, ) diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index a520bdca1..6bca544b1 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -6,6 +6,8 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader +from .bge_models import bge_training_data + rubert_tiny2 = ModelMeta( name="cointegrated/rubert-tiny2", languages=["rus_Cyrl"], @@ -96,20 +98,27 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, training_datasets={ - "deepvk/ru-HNP": ["train"], - "deepvk/ru-WANLI": ["train"], - "Shitao/bge-m3-data": ["train"], - "RussianNLP/russian_super_glue": ["train"], - "reciTAL/mlsum": ["train"], - "Helsinki-NLP/opus-100": ["train"], - "Helsinki-NLP/bible_para": ["train"], - "d0rj/rudetoxifier_data_detox": ["train"], - "s-nlp/ru_paradetox": ["train"], - "Milana/russian_keywords": ["train"], - "IlyaGusev/gazeta": ["train"], - "d0rj/gsm8k-ru": ["train"], - "bragovo/dsum_ru": ["train"], - "CarlBrendt/Summ_Dialog_News": ["train"], + "BibleNLPBitextMining": ["train"], + "MLSUMClusteringP2P": ["train"], + "MLSUMClusteringP2P.v2": ["train"], + "MLSUMClusteringS2S": ["train"], + "MLSUMClusteringS2S.v2": ["train"], + **bge_training_data, + # not MTEB: + # "deepvk/ru-HNP": ["train"], + # "deepvk/ru-WANLI": ["train"], + # "Shitao/bge-m3-data": ["train"], + # "RussianNLP/russian_super_glue": ["train"], + # "reciTAL/mlsum": ["train"], + # "Helsinki-NLP/opus-100": ["train"], + # "Helsinki-NLP/bible_para": ["train"], + # "d0rj/rudetoxifier_data_detox": ["train"], + # "s-nlp/ru_paradetox": ["train"], + # "Milana/russian_keywords": ["train"], + # "IlyaGusev/gazeta": ["train"], + # "d0rj/gsm8k-ru": ["train"], + # "bragovo/dsum_ru": ["train"], + # "CarlBrendt/Summ_Dialog_News": ["train"], }, ) @@ -213,7 +222,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - training_datasets={"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, + training_datasets=None, # source model is unknown + # Not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, ) labse_ru_turbo = ModelMeta( @@ -231,7 +241,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - training_datasets={"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, + training_datasets=None, # source model is unknown + # not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, ) diff --git a/mteb/models/salesforce_models.py b/mteb/models/salesforce_models.py index b1d45b949..18db09a2b 100644 --- a/mteb/models/salesforce_models.py +++ b/mteb/models/salesforce_models.py @@ -40,6 +40,19 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + adapted_from="intfloat/e5-mistral-7b-instruct", + public_training_code=False, + public_training_data=False, +
training_datasets={ # inherits from e5 + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + }, ) @@ -68,4 +81,16 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_code=False, + public_training_data=False, + training_datasets={ # inherits from e5 + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + }, ) diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index 18b08f16f..f8b01c6ea 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -60,6 +60,40 @@ "zho_Hant", ] +sent_trf_training_dataset = { + # derived from datasheets + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB + # "s2orc": ["train"], + # "flax-sentence-embeddings/stackexchange_xml": ["train"], + # "ms_marco": ["train"], + # "gooaq": ["train"], + # "yahoo_answers_topics": ["train"], + # "code_search_net": ["train"], + # "search_qa": ["train"], + # "eli5": ["train"], + # "snli": ["train"], + # "multi_nli": ["train"], + # "wikihow": ["train"], + # "natural_questions": ["train"], + # "trivia_qa": ["train"], + # "embedding-data/sentence-compression": ["train"], + # "embedding-data/flickr30k-captions": ["train"], + # "embedding-data/altlex": ["train"], + # "embedding-data/simple-wiki": ["train"], + # "embedding-data/QQP": ["train"], + # "embedding-data/SPECTER": ["train"], + # "embedding-data/PAQ_pairs": ["train"], + # "embedding-data/WikiAnswers": ["train"], +} + all_MiniLM_L6_v2 = ModelMeta( name="sentence-transformers/all-MiniLM-L6-v2", languages=["eng-Latn"], @@ -77,40 +111,31 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, # does sentence transformer count? 
+ training_datasets=sent_trf_training_dataset, + public_training_code=True, + public_training_data=True, +) + +all_MiniLM_L12_v2 = ModelMeta( + name="sentence-transformers/all-MiniLM-L12-v2", + languages=["eng-Latn"], + open_weights=True, + revision="364dd28d28dcd3359b537f3cf1f5348ba679da62", + release_date="2021-08-30", + n_parameters=33_400_000, + memory_usage=None, + embed_dim=384, + license="apache-2.0", + max_tokens=256, + reference="https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + training_datasets=sent_trf_training_dataset, + public_training_code=True, public_training_data=True, - training_datasets={ - # source: frontmatter in readme - # trained on stack exchange, unsure if sources match - "StackExchangeClusteringP2P": ["test"], - "StackExchangeClusteringP2P.v2": ["test"], - "StackExchangeClustering": ["test"], - "StackExchangeClustering.v2": ["test"], - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], - # Non MTEB sources - # "s2orc": ["train"], - # "flax-sentence-embeddings/stackexchange_xml": ["train"], - # "ms_marco": ["train"], - # "gooaq": ["train"], - # "yahoo_answers_topics": ["train"], - # "code_search_net": ["train"], - # "search_qa": ["train"], - # "eli5": ["train"], - # "snli": ["train"], - # "multi_nli": ["train"], - # "wikihow": ["train"], - # "trivia_qa": ["train"], - # "embedding-data/sentence-compression": ["train"], - # "embedding-data/flickr30k-captions": ["train"], - # "embedding-data/altlex": ["train"], - # "embedding-data/simple-wiki": ["train"], - # "embedding-data/QQP": ["train"], - # "embedding-data/SPECTER": ["train"], - # "embedding-data/PAQ_pairs": ["train"], - # "embedding-data/WikiAnswers": ["train"], - }, ) paraphrase_multilingual_MiniLM_L12_v2 = ModelMeta( @@ -130,6 +155,9 @@ use_instructions=False, superseded_by=None, adapted_from=None, + training_datasets=sent_trf_training_dataset, # assumed (probably some parallel as well) + public_training_code=True, + public_training_data=True, ) paraphrase_multilingual_mpnet_base_v2 = ModelMeta( @@ -149,6 +177,20 @@ use_instructions=False, superseded_by=None, adapted_from=None, + training_datasets=sent_trf_training_dataset, + # + https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/paraphrases/training.py + # which include (not in MTEB): + # "all-nli": all_nli_train_dataset, + # "sentence-compression": sentence_compression_train_dataset, + # "simple-wiki": simple_wiki_train_dataset, + # "altlex": altlex_train_dataset, + # "quora-duplicates": quora_train_dataset, + # "coco-captions": coco_train_dataset, + # "flickr30k-captions": flickr_train_dataset, + # "yahoo-answers": yahoo_answers_train_dataset, + # "stack-exchange": stack_exchange_train_dataset, + public_training_code=True, + public_training_data=True, ) labse = ModelMeta( @@ -168,6 +210,9 @@ use_instructions=False, superseded_by=None, adapted_from=None, + training_datasets=None, # scraped and mined webdata including CC, wiki, see section 3.1 https://aclanthology.org/2022.acl-long.62.pdf + public_training_code=True, # https://www.kaggle.com/models/google/labse/tensorFlow2/labse/2?tfhub-redirect=true + public_training_data=False, ) multi_qa_MiniLM_L6_cos_v1 = ModelMeta( @@ -186,7 +231,10 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, superseded_by=None, - adapted_from=None, + 
adapted_from="nreimers/MiniLM-L6-H384-uncased", + training_datasets=sent_trf_training_dataset, # assumed + public_training_code=None, + public_training_data=None, ) all_mpnet_base_v2 = ModelMeta( @@ -206,280 +254,11 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, # does sentence transformer count? - public_training_data=True, - training_datasets={ - # source: frontmatter in readme - # trained on stack exchange, unsure if sources match - "StackExchangeClusteringP2P": ["test"], - "StackExchangeClusteringP2P.v2": ["test"], - "StackExchangeClustering": ["test"], - "StackExchangeClustering.v2": ["test"], - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], - # Non MTEB source - # "s2orc": ["train"], - # "flax-sentence-embeddings/stackexchange_xml": ["train"], - # "ms_marco": ["train"], - # "gooaq": ["train"], - # "yahoo_answers_topics": ["train"], - # "code_search_net": ["train"], - # "search_qa": ["train"], - # "eli5": ["train"], - # "snli": ["train"], - # "multi_nli": ["train"], - # "wikihow": ["train"], - # "trivia_qa": ["train"], - # "embedding-data/sentence-compression": ["train"], - # "embedding-data/flickr30k-captions": ["train"], - # "embedding-data/altlex": ["train"], - # "embedding-data/simple-wiki": ["train"], - # "embedding-data/QQP": ["train"], - # "embedding-data/SPECTER": ["train"], - # "embedding-data/PAQ_pairs": ["train"], - # "embedding-data/WikiAnswers": ["train"], - }, -) - -# Source: https://arxiv.org/pdf/1907.04307 -use_multilingual_languages = [ - "ara-Arab", # Arabic - "zho-Hans", # Chinese (Simplified, PRC) - "zho-Hant", # Chinese (Traditional, Taiwan) - "nld-Latn", # Dutch - "eng-Latn", # English - "deu-Latn", # German - "fra-Latn", # French - "ita-Latn", # Italian - "por-Latn", # Portuguese - "spa-Latn", # Spanish - "jpn-Jpan", # Japanese - "kor-Kore", # Korean - "rus-Cyrl", # Russian - "pol-Latn", # Polish - "tha-Thai", # Thai - "tur-Latn", # Turkish -] -use_multilingual_training_data = { - # I'm not certain since they mined this themselves, but I would assume that there is significant overlap - "StackOverflowQARetrieval": ["train", "test"], - # Not in MTEB: - # - SNLI translated to 15 languages (could have intersections with other NLI datasets) - # - Translation pairs: Mined from the internet - # - QA mined from Reddit, StackOverflow, YahooAnswers (could be problematic) -} -distiluse_base_multilingual_cased_v2 = ModelMeta( - name="sentence-transformers/distiluse-base-multilingual-cased-v2", - languages=use_multilingual_languages, - open_weights=True, - revision="dad0fa1ee4fa6e982d3adbce87c73c02e6aee838", - release_date="2021-06-22", # First commit - n_parameters=135 * 1e6, - memory_usage=None, - embed_dim=768, - license="apache-2.0", - max_tokens=512, - reference="https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, + training_datasets=sent_trf_training_dataset, public_training_code=True, public_training_data=True, - training_datasets=use_multilingual_training_data, ) -use_cmlm_multilingual = ModelMeta( - name="sentence-transformers/use-cmlm-multilingual", - languages=paraphrase_langs, - open_weights=True, - revision="6f8ff6583c371cbc4d6d3b93a5e37a888fd54574", - release_date="2022-04-14", # First commit - n_parameters=472 * 1e6, - memory_usage=None, - embed_dim=768, - license="apache-2.0", - max_tokens=256, - 
reference="https://huggingface.co/sentence-transformers/use-cmlm-multilingual", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from="sentence-transformers/LaBSE", - public_training_code=True, - public_training_data=True, - training_datasets={ - # Not in MTEB: - # - SNLI - # - Translation corpus based largely on Uszkoreit et al. (2010) - }, -) - - -jina_embeddings_v2_base_en = ModelMeta( - name="jinaai/jina-embeddings-v2-base-en", - languages=["eng-Latn"], - open_weights=True, - revision="6e85f575bc273f1fd840a658067d0157933c83f0", - release_date="2023-09-27", - n_parameters=137_000_000, - memory_usage=None, - embed_dim=768, - license="apache-2.0", - max_tokens=8192, - reference="https://huggingface.co/jinaai/jina-embeddings-v2-base-en", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, - training_datasets={"allenai/c4": ["train"]}, -) - -jina_embeddings_v2_base_zh = ModelMeta( - name="jinaai/jina-embeddings-v2-base-zh", - languages=["eng-Latn", "zho-Hans"], - open_weights=True, - revision="c1ff9086a89a1123d7b5eff58055a665db4fb4b9", - release_date="2024-01-10", - n_parameters=161_000_000, - memory_usage=None, - embed_dim=768, - license="apache-2.0", - max_tokens=8192, - reference="https://huggingface.co/jinaai/jina-embeddings-v2-base-zh", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, - training_datasets={ - # source: https://arxiv.org/pdf/2402.17016 - "XNLI": ["train"], - "MLSumClusteringS2S": ["train"], - "MLSumClusteringP2P": ["train"], - # Not in MTEB: - # - MQA - # - XLSUM - }, -) - - -jina_embeddings_v2_small_en = ModelMeta( - name="jinaai/jina-embeddings-v2-small-en", - languages=["eng-Latn"], - open_weights=True, - revision="796cff318cdd4e5fbe8b7303a1ef8cbec36996ef", - release_date="2023-09-27", - n_parameters=32_700_000, - memory_usage=None, - embed_dim=512, - license="apache-2.0", - max_tokens=8192, - reference="https://huggingface.co/jinaai/jina-embeddings-v2-small-en", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, - training_datasets={"jinaai/negation-dataset": ["train"]}, -) - -jina_embedding_b_en_v1 = ModelMeta( - name="jinaai/jina-embedding-b-en-v1", - languages=["eng-Latn"], - open_weights=True, - revision="aa0645035294a8c0607ce5bb700aba982cdff32c", - release_date="2023-07-07", - n_parameters=110_000_000, - memory_usage=None, - embed_dim=768, - license="apache-2.0", - max_tokens=512, - reference="https://huggingface.co/jinaai/jina-embedding-b-en-v1", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by="jinaai/jina-embeddings-v2-base-en", - adapted_from=None, - training_datasets={"jinaai/negation-dataset": ["train"]}, -) - -jina_embedding_s_en_v1 = ModelMeta( - name="jinaai/jina-embedding-s-en-v1", - languages=["eng-Latn"], - open_weights=True, - revision="c1fed70aa4823a640f1a7150a276e4d3b08dce08", - release_date="2023-07-07", - n_parameters=35_000_000, - memory_usage=None, - embed_dim=512, - license="apache-2.0", - max_tokens=512, - reference="https://huggingface.co/jinaai/jina-embedding-s-en-v1", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - 
superseded_by="jinaai/jina-embeddings-v2-small-en", - adapted_from=None, - training_datasets={"jinaai/negation-dataset": ["train"]}, -) - - -all_MiniLM_L12_v2 = ModelMeta( - name="sentence-transformers/all-MiniLM-L12-v2", - languages=["eng-Latn"], - open_weights=True, - revision="364dd28d28dcd3359b537f3cf1f5348ba679da62", - release_date="2021-08-30", - n_parameters=33_400_000, - memory_usage=None, - embed_dim=384, - license="apache-2.0", - max_tokens=256, - reference="https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, - public_training_code=False, # does sentence transformer count? - public_training_data=True, - training_datasets={ - # source: frontmatter in readme - # trained on stack exchange, unsure if sources match - "StackExchangeClusteringP2P": ["test"], - "StackExchangeClusteringP2P.v2": ["test"], - "StackExchangeClustering": ["test"], - "StackExchangeClustering.v2": ["test"], - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], - # Non MTEB sources - # "s2orc": ["train"], - # "flax-sentence-embeddings/stackexchange_xml": ["train"], - # "ms_marco": ["train"], - # "gooaq": ["train"], - # "yahoo_answers_topics": ["train"], - # "code_search_net": ["train"], - # "search_qa": ["train"], - # "eli5": ["train"], - # "snli": ["train"], - # "multi_nli": ["train"], - # "wikihow": ["train"], - # "trivia_qa": ["train"], - # "embedding-data/sentence-compression": ["train"], - # "embedding-data/flickr30k-captions": ["train"], - # "embedding-data/altlex": ["train"], - # "embedding-data/simple-wiki": ["train"], - # "embedding-data/QQP": ["train"], - # "embedding-data/SPECTER": ["train"], - # "embedding-data/PAQ_pairs": ["train"], - # "embedding-data/WikiAnswers": ["train"], - }, -) microllama_text_embedding = ModelMeta( name="keeeeenw/MicroLlama-text-embedding", @@ -499,9 +278,11 @@ superseded_by=None, adapted_from=None, training_datasets={ - # shource yaml header: - "NQ": ["test"] - # not in MTEB: + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # not in MTEB # "sentence-transformers/all-nli": ["train"], # "sentence-transformers/stsb": ["train"], # "sentence-transformers/quora-duplicates": ["train"], diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index c7f9aad9f..1e04b4116 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -28,6 +28,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_400M_v5", + training_datasets=None, + public_training_data=False, # currently not released + public_training_code=False, ) stella_en_1_5b = ModelMeta( @@ -52,6 +55,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_1.5B_v5", + training_datasets=None, + public_training_data=False, # currently not released + public_training_code=False, ) stella_large_zh_v3_1792d = ModelMeta( diff --git a/mteb/models/uae_models.py b/mteb/models/uae_models.py index 5c47cba67..ffdaa29f7 100644 --- a/mteb/models/uae_models.py +++ b/mteb/models/uae_models.py @@ -75,4 +75,13 @@ def encode( framework=["Sentence Transformers", "PyTorch"], reference="https://huggingface.co/WhereIsAI/UAE-Large-V1", use_instructions=True, + training_datasets={ + 
# source: https://arxiv.org/pdf/2309.12871 + # not in MTEB + "MNLI": [], + "NLI": [], + "SNLI": [], + }, + public_training_data=True, + public_training_code=True, ) diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index 70f61e2c5..12925b235 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -157,6 +157,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_finance_2 = ModelMeta( @@ -179,6 +182,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_law_2 = ModelMeta( @@ -201,6 +207,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_code_2 = ModelMeta( @@ -223,6 +232,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_large_2 = ModelMeta( @@ -245,6 +257,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_2 = ModelMeta( @@ -267,6 +282,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, + public_training_code=False, ) voyage_multilingual_2 = ModelMeta( name="voyageai/voyage-multilingual-2", @@ -288,6 +306,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_3 = ModelMeta( @@ -310,6 +331,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) voyage_3_lite = ModelMeta( @@ -332,4 +356,7 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, + training_datasets=None, + public_training_data=False, # couldn't find + public_training_code=False, ) From 9823529282b131e7f24399eb0639fbc33280d148 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 17 Jan 2025 14:55:32 +0100 Subject: [PATCH 26/49] fix: Added Misc Chinese models (#1819) * Added moka and piccolo models to overview file * Added Text2Vec models * Added various Chinese embedding models --------- Co-authored-by: Isaac Chung --- mteb/models/misc_models.py | 88 ++++++++++++++++++++++++++++ mteb/models/overview.py | 5 ++ mteb/models/text2vec_models.py | 103 +++++++++++++++++++++++++++++++++ 3 files changed, 196 insertions(+) create mode 100644 mteb/models/text2vec_models.py diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index 88dad0050..09e423240 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -1738,3 +1738,91 @@ training_datasets=None, # They don't specify superseded_by=None, ) +xiaobu_embedding = ModelMeta( + name="lier007/xiaobu-embedding", + revision="59c79d82eb5223cd9895f6eb8e825c7fa10e4e92", + release_date="2024-01-09", + 
languages=["zho_Hans"], + loader=None, + n_parameters=326 * 1e6, + memory_usage=None, + max_tokens=512, + embed_dim=1024, + license="not specified", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/lier007/xiaobu-embedding", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, # Finetuned from GTE, none of them disclose training data + superseded_by=None, + adapted_from="thenlper/gte-large-zh", +) +xiaobu_embedding_v2 = ModelMeta( + name="lier007/xiaobu-embedding-v2", + revision="1912f2e59a5c2ef802a471d735a38702a5c9485e", + release_date="2024-06-30", + languages=["zho_Hans"], + loader=None, + n_parameters=326 * 1e6, + memory_usage=None, + max_tokens=512, + embed_dim=768, + license="not specified", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/lier007/xiaobu-embedding-v2", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, # Finetuned from piccolo-embedding, none of them say + superseded_by=None, + adapted_from="sensenova/piccolo-base-zh", +) +yinka_embedding = ModelMeta( + name="Classical/Yinka", + revision="59c79d82eb5223cd9895f6eb8e825c7fa10e4e92", + release_date="2024-01-09", + languages=["zho_Hans"], + loader=None, + n_parameters=326 * 1e6, + memory_usage=None, + max_tokens=512, + embed_dim=1024, + license="not specified", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Classical/Yinka", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, # Not disclosed + superseded_by=None, + adapted_from="dunzhang/stella-mrl-large-zh-v3.5-1792d", +) +conan_embedding = ModelMeta( + name="TencentBAC/Conan-embedding-v1", + revision="bb9749a57d4f02fd71722386f8d0f5a9398d7eeb", + release_date="2024-08-22", + languages=["zho_Hans"], + loader=None, + n_parameters=326 * 1e6, + memory_usage=None, + max_tokens=512, + embed_dim=768, + license="cc-by-nc-4.0", + open_weights=True, + public_training_data=False, + public_training_code=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/Classical/Yinka", + similarity_fn_name="cosine", + use_instructions=None, + # source: https://arxiv.org/pdf/2408.15710 + training_datasets=None, # They "scraped" things from the internet, we don't know, could be leakage + superseded_by=None, +) diff --git a/mteb/models/overview.py b/mteb/models/overview.py index 634530089..ea0fa1524 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -36,6 +36,7 @@ nomic_models, nvidia_models, openai_models, + piccolo_models, promptriever_models, repllama_models, rerankers_custom, @@ -44,6 +45,7 @@ salesforce_models, sentence_transformers_models, stella_models, + text2vec_models, uae_models, voyage_models, ) @@ -69,11 +71,13 @@ llm2vec_models, mxbai_models, model2vec_models, + moka_models, misc_models, nomic_models, no_instruct_sentence_models, nvidia_models, openai_models, + piccolo_models, promptriever_models, repllama_models, rerankers_custom, @@ -88,6 +92,7 @@ jina_models, jasper_models, uae_models, + text2vec_models, stella_models, uae_models, voyage_models, diff --git a/mteb/models/text2vec_models.py b/mteb/models/text2vec_models.py new file mode 100644 index 000000000..e26108e0a --- /dev/null +++ 
b/mteb/models/text2vec_models.py @@ -0,0 +1,103 @@ +"""Implementation of Text2Vec models""" + +from __future__ import annotations + +from mteb.model_meta import ModelMeta + +# I couldn't find the large model on HF for some reason +text2vec_base_chinese = ModelMeta( + name="shibing624/text2vec-base-chinese", + languages=["zho-Hans"], + open_weights=True, + revision="183bb99aa7af74355fb58d16edf8c13ae7c5433e", + release_date="2022-01-23", + n_parameters=102 * 1e6, + memory_usage=None, + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/shibing624/text2vec-base-chinese", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=False, # Couldn't find it + public_training_data=True, + training_datasets={ + # source: https://huggingface.co/shibing624/text2vec-base-chinese + # Not in MTEB + # - shibing624/nli-zh-all/text2vec-base-chinese-sentence-dataset + # (Could have overlaps I'm not aware of) + }, +) + +text2vec_base_chinese_paraphrase = ModelMeta( + name="shibing624/text2vec-base-chinese-paraphrase", + languages=["zho-Hans"], + open_weights=True, + revision="e90c150a9c7fb55a67712a766d6820c55fb83cdd", + release_date="2023-06-19", + n_parameters=118 * 1e6, + memory_usage=None, + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/shibing624/text2vec-base-chinese-paraphrase", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=False, # Couldn't find it + public_training_data=True, + training_datasets={ + # source: https://huggingface.co/shibing624/text2vec-base-chinese + # Not in MTEB + # - shibing624/nli-zh-all/text2vec-base-chinese-paraphrase + # (Could have overlaps I'm not aware of) + }, +) + + +text2vec_multi_langs = [ + "deu-Latn", # German (de) + "eng-Latn", # English (en) + "spa-Latn", # Spanish (es) + "fra-Latn", # French (fr) + "ita-Latn", # Italian (it) + "nld-Latn", # Dutch (nl) + "pol-Latn", # Polish (pl) + "por-Latn", # Portuguese (pt) + "rus-Cyrl", # Russian (ru) + "zho-Hans", # Chinese (Simplified, zh) +] +text2vec_base_multilingual = ModelMeta( + name="shibing624/text2vec-base-multilingual", + languages=text2vec_multi_langs, + open_weights=True, + revision="6633dc49e554de7105458f8f2e96445c6598e9d1", + release_date="2023-06-22", + # While it can be loaded with SBERT, it has one suspicious file according to huggingface + # So probably best not to. 
+ loader=None, + n_parameters=118 * 1e6, + memory_usage=None, + embed_dim=384, + license="apache-2.0", + max_tokens=256, + reference="https://huggingface.co/shibing624/text2vec-base-chinese-paraphrase", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", + public_training_code=False, # Couldn't find it + public_training_data=True, + training_datasets={ + # source: https://huggingface.co/shibing624/text2vec-base-chinese + # Not in MTEB + # - shibing624/nli-zh-all/tree/main/text2vec-base-multilingual-dataset + # # (Could have overlaps I'm not aware of) + }, +) From b4d0eaa9ce60bd7661e4bfef5d75edc79bbee83b Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 17 Jan 2025 14:04:41 +0000 Subject: [PATCH 27/49] 1.29.8 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1e1d07e99..61fbf7521 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.7" +version = "1.29.8" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 96f639bc34153caaac422a3a13e0d9f3626d65b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 17 Jan 2025 15:54:37 +0100 Subject: [PATCH 28/49] fix: Fixed eval split for MultilingualSentiment in C-MTEB (#1804) * Fixed eval split for MultilingualSentiment in C-MTEB * FIxed splits for atec, bq and stsb in C-MTEB --- mteb/benchmarks/benchmarks.py | 75 +++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 34 deletions(-) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 7d8aedc9d..97dc6acb6 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -1152,40 +1152,47 @@ def load_results( C_MTEB = Benchmark( name="MTEB(Chinese)", - tasks=get_tasks( - tasks=[ - "T2Retrieval", - "MMarcoRetrieval", - "DuRetrieval", - "CovidRetrieval", - "CmedqaRetrieval", - "EcomRetrieval", - "MedicalRetrieval", - "VideoRetrieval", - "T2Reranking", - "MMarcoReranking", - "CMedQAv1-reranking", - "CMedQAv2-reranking", - "Ocnli", - "Cmnli", - "CLSClusteringS2S", - "CLSClusteringP2P", - "ThuNewsClusteringS2S", - "ThuNewsClusteringP2P", - "ATEC", - "BQ", - "LCQMC", - "PAWSX", - "STSB", - "AFQMC", - "QBQTC", - "TNews", - "IFlyTek", - "Waimai", - "OnlineShopping", - "MultilingualSentiment", - "JDReview", - ], + tasks=MTEBTasks( + get_tasks( + tasks=[ + "T2Retrieval", + "MMarcoRetrieval", + "DuRetrieval", + "CovidRetrieval", + "CmedqaRetrieval", + "EcomRetrieval", + "MedicalRetrieval", + "VideoRetrieval", + "T2Reranking", + "MMarcoReranking", + "CMedQAv1-reranking", + "CMedQAv2-reranking", + "Ocnli", + "Cmnli", + "CLSClusteringS2S", + "CLSClusteringP2P", + "ThuNewsClusteringS2S", + "ThuNewsClusteringP2P", + "LCQMC", + "PAWSX", + "AFQMC", + "QBQTC", + "TNews", + "IFlyTek", + "Waimai", + "OnlineShopping", + "JDReview", + ], + ) + + get_tasks(tasks=["MultilingualSentiment"], eval_splits=["test"]) + + get_tasks( + tasks=[ + "ATEC", + "BQ", + "STSB", + ], + eval_splits=["validation"], + ) ), description="The Chinese Massive Text Embedding Benchmark (C-MTEB) is a comprehensive benchmark for Chinese text embeddings covering 6 tasks and 35 datasets.", reference="https://github.com/FlagOpen/FlagEmbedding/tree/master/research/C_MTEB", 
From 762f729b70cdad2e9137a68af4b2693ca96dd3b7 Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 17 Jan 2025 15:09:41 +0000 Subject: [PATCH 29/49] 1.29.9 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 61fbf7521..6ec3f11e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.8" +version = "1.29.9" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 8be6b2e36abb005822e07c034484c245345f6eb2 Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Mon, 20 Jan 2025 08:50:33 +0300 Subject: [PATCH 30/49] fix: subsets to run (#1830) * fix split evals * add test * lint * fix moka * add assert --- mteb/abstasks/AbsTask.py | 2 +- tests/test_benchmark/mock_tasks.py | 10 ++++------ tests/test_evaluation/test_split_evaluation.py | 16 ++++++++++++++++ 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/mteb/abstasks/AbsTask.py b/mteb/abstasks/AbsTask.py index 1d2e4fcb0..1ec1ebc4f 100644 --- a/mteb/abstasks/AbsTask.py +++ b/mteb/abstasks/AbsTask.py @@ -118,7 +118,7 @@ def evaluate( hf_subsets = copy(self.hf_subsets) if subsets_to_run is not None: # allow overwrites of pre-filtering - hf_subsets = subsets_to_run + hf_subsets = [s for s in hf_subsets if s in subsets_to_run] for hf_subset in hf_subsets: logger.info( diff --git a/tests/test_benchmark/mock_tasks.py b/tests/test_benchmark/mock_tasks.py index d3a11b2a4..7d3d2d752 100644 --- a/tests/test_benchmark/mock_tasks.py +++ b/tests/test_benchmark/mock_tasks.py @@ -927,12 +927,10 @@ def load_data(self, **kwargs): ), } - self.dataset = DatasetDict( - { - "eng": data, - "fra": data, - } - ) + self.dataset = {} + for lang in self.hf_subsets: + self.dataset[lang] = data + self.data_loaded = True @property diff --git a/tests/test_evaluation/test_split_evaluation.py b/tests/test_evaluation/test_split_evaluation.py index 7ce3f512e..7db10e09d 100644 --- a/tests/test_evaluation/test_split_evaluation.py +++ b/tests/test_evaluation/test_split_evaluation.py @@ -8,6 +8,7 @@ ) from tests.test_benchmark.mock_tasks import ( MockMultilingualRetrievalTask, + MockMultilingualSTSTask, MockRetrievalTask, ) @@ -362,3 +363,18 @@ def test_all_splits_subsets_evaluated_with_overwrite( for split in ["test"]: assert len(results2[0].scores[split]) == 2 assert sorted(results2[0].languages) == ["eng", "fra"] + + +def test_splits_evaluated_with_prefiltering(): + """Test that the evaluation only runs on the specified languages. 
Issue https://github.com/embeddings-benchmark/mteb/pull/1787#issuecomment-2598205049""" + task = MockMultilingualSTSTask().filter_languages(languages=["fra"]) + + evaluation = MTEB(tasks=[task]) + + results = evaluation.run(MockSentenceTransformer(), overwrite_results=True) + result_scores = results[0].scores + + assert len(result_scores) == 1 + assert "test" in result_scores + assert len(result_scores["test"]) == 1 + assert result_scores["test"][0]["hf_subset"] == "fra" From 0a83e383efe41e86e51c0d4cdca18d9ed5d42821 Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Mon, 20 Jan 2025 08:52:02 +0300 Subject: [PATCH 31/49] fix: Remove default params, `public_training_data` and `memory usage` in `ModelMeta` (#1794) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: Leaderboard: `K` instead of `M` Fixes #1752 * format * fixed existing annotations to refer to task name instead of hf dataset * added annotation to nvidia * added voyage * added uae annotations * Added stella annotations * sentence trf models * added salesforce and e5 * jina * bge + model2vec * added llm2vec annotations * add jasper * format * format * Updated annotations and moved jina models * make models parameters needed to be filled * fix tests * remove comments * remove model meta from test * fix model meta from split * fix: add even more training dataset annotations (#1793) * fix: update max tokens for OpenAI (#1772) update max tokens * ci: skip AfriSentiLID for now (#1785) * skip AfriSentiLID for now * skip relevant test case instead --------- Co-authored-by: Isaac Chung * 1.28.7 Automatically generated by python-semantic-release * ci: fix model loading test (#1775) * pass base branch into the make command as an arg * test a file that has custom wrapper * what about overview * just dont check overview * revert instance check * explicitly omit overview and init * remove test change * try on a lot of models * revert test model file --------- Co-authored-by: Isaac Chung * feat: Update task filtering, fixing bug which included cross-lingual tasks in overly many benchmarks (#1787) * feat: Update task filtering, fixing bug on MTEB - Updated task filtering adding exclusive_language_filter and hf_subset - fix bug in MTEB where cross-lingual splits were included - added missing language filtering to MTEB(europe, beta) and MTEB(indic, beta) The following code outlines the problems: ```py import mteb from mteb.benchmarks import MTEB_ENG_CLASSIC task = [t for t in MTEB_ENG_CLASSIC.tasks if t.metadata.name == "STS22"][0] # was eq. to: task = mteb.get_task("STS22", languages=["eng"]) task.hf_subsets # correct filtering to English datasets: # ['en', 'de-en', 'es-en', 'pl-en', 'zh-en'] # However it should be: # ['en'] # with the changes it is: task = [t for t in MTEB_ENG_CLASSIC.tasks if t.metadata.name == "STS22"][0] task.hf_subsets # ['en'] # eq. to task = mteb.get_task("STS22", hf_subsets=["en"]) # which you can also obtain using the exclusive_language_filter (though not if there was multiple english splits): task = mteb.get_task("STS22", languages=["eng"], exclusive_language_filter=True) ``` * format * remove "en-ext" from AmazonCounterfactualClassification * fixed mteb(deu) * fix: simplify in a few areas * fix: Add gritlm * 1.29.0 Automatically generated by python-semantic-release * fix: Added more annotations! 
* fix: Added C-MTEB (#1786) Added C-MTEB * 1.29.1 Automatically generated by python-semantic-release * docs: Add contact to MMTEB benchmarks (#1796) * Add myself to MMTEB benchmarks * lint * fix: loading pre 11 (#1798) * fix loading pre 11 * add similarity * lint * run all task types * 1.29.2 Automatically generated by python-semantic-release * fix: allow to load no revision available (#1801) * fix allow to load no revision available * lint * add require_model_meta to leaderboard * lint * 1.29.3 Automatically generated by python-semantic-release --------- Co-authored-by: Roman Solomatin Co-authored-by: Isaac Chung Co-authored-by: Isaac Chung Co-authored-by: github-actions Co-authored-by: Márton Kardos * fig merges * update models info * change public_training_code to str * change `public_training_code=False` to None * remove annotations * remove annotations * remove changed annotations * remove changed annotations * remove `public_training_data` and `memory usage` * make framework not optional * make framework non-optional * empty frameworks * add framework * fix tests * Update mteb/models/overview.py Co-authored-by: Isaac Chung --------- Co-authored-by: Kenneth Enevoldsen Co-authored-by: Isaac Chung Co-authored-by: Isaac Chung Co-authored-by: github-actions Co-authored-by: Márton Kardos --- mteb/model_meta.py | 24 ++-- mteb/models/arctic_models.py | 31 ++--- mteb/models/bge_models.py | 18 +-- mteb/models/bm25.py | 5 +- mteb/models/cohere_models.py | 16 +-- mteb/models/colbert_models.py | 6 +- mteb/models/e5_instruct.py | 8 +- mteb/models/e5_models.py | 36 ++--- mteb/models/google_models.py | 12 +- mteb/models/gritlm_models.py | 8 +- mteb/models/gte_models.py | 18 +-- mteb/models/ibm_granite_models.py | 16 ++- mteb/models/inf_models.py | 4 +- mteb/models/jasper_models.py | 2 - mteb/models/jina_models.py | 19 +-- mteb/models/linq_models.py | 3 +- mteb/models/llm2vec_models.py | 32 ++--- mteb/models/misc_models.py | 138 -------------------- mteb/models/model2vec_models.py | 21 +-- mteb/models/moka_models.py | 12 +- mteb/models/mxbai_models.py | 3 +- mteb/models/no_instruct_sentence_models.py | 3 +- mteb/models/nomic_models.py | 15 ++- mteb/models/nvidia_models.py | 4 - mteb/models/openai_models.py | 16 +-- mteb/models/overview.py | 41 +++++- mteb/models/piccolo_models.py | 8 +- mteb/models/promptriever_models.py | 8 +- mteb/models/repllama_models.py | 5 +- mteb/models/rerankers_custom.py | 27 ++++ mteb/models/rerankers_monot5_based.py | 120 +++++++++++++++++ mteb/models/ru_sentence_models.py | 41 ++++-- mteb/models/salesforce_models.py | 8 +- mteb/models/sentence_transformers_models.py | 28 +--- mteb/models/stella_models.py | 22 +--- mteb/models/text2vec_models.py | 12 +- mteb/models/uae_models.py | 3 +- mteb/models/voyage_models.py | 36 ++--- scripts/generate_metadata.py | 11 +- tests/test_tasks/test_mteb_rerank.py | 14 +- 40 files changed, 386 insertions(+), 468 deletions(-) diff --git a/mteb/model_meta.py b/mteb/model_meta.py index 0a1befc2c..b105f301b 100644 --- a/mteb/model_meta.py +++ b/mteb/model_meta.py @@ -59,7 +59,6 @@ class ModelMeta(BaseModel): name: The name of the model, ideally the name on huggingface. n_parameters: The number of parameters in the model, e.g. 7_000_000 for a 7M parameter model. Can be None if the the number of parameters is not known (e.g. for proprietary models) or if the loader returns a SentenceTransformer model from which it can be derived. - memory_usage: The amount of memory the model uses in GB. Can be None if the memory usage is not known (e.g. 
for proprietary models). max_tokens: The maximum number of tokens the model can handle. Can be None if the maximum number of tokens is not known (e.g. for proprietary models). embed_dim: The dimension of the embeddings produced by the model. Currently all models are assumed to produce fixed-size embeddings. @@ -67,7 +66,6 @@ class ModelMeta(BaseModel): release_date: The date the model's revision was released. license: The license under which the model is released. Required if open_weights is True. open_weights: Whether the model is open source or proprietary. - public_training_data: Whether the training data used to train the model is publicly available. public_training_code: Whether the code used to train the model is publicly available. similarity_fn_name: The distance metric used by the model. framework: The framework the model is implemented in, can be a list of frameworks e.g. `["Sentence Transformers", "PyTorch"]`. @@ -90,19 +88,17 @@ class ModelMeta(BaseModel): release_date: STR_DATE | None languages: list[ISO_LANGUAGE_SCRIPT] | None loader: Callable[..., Encoder] | None = None - n_parameters: int | None = None - memory_usage: float | None = None - max_tokens: float | None = None - embed_dim: int | None = None - license: str | None = None - open_weights: bool | None = None - public_training_data: bool | None = None - public_training_code: bool | None = None - framework: list[FRAMEWORKS] = [] + n_parameters: int | None + max_tokens: float | None + embed_dim: int | None + license: str | None + open_weights: bool | None + public_training_code: str | None + framework: list[FRAMEWORKS] reference: STR_URL | None = None - similarity_fn_name: DISTANCE_METRICS | None = None - use_instructions: bool | None = None - training_datasets: dict[str, list[str]] | None = None + similarity_fn_name: DISTANCE_METRICS | None + use_instructions: bool | None + training_datasets: dict[str, list[str]] | None adapted_from: str | None = None superseded_by: str | None = None diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py index b4c2b97ac..66822d41b 100644 --- a/mteb/models/arctic_models.py +++ b/mteb/models/arctic_models.py @@ -94,7 +94,6 @@ open_weights=True, framework=["Sentence Transformers", "PyTorch"], n_parameters=22_600_000, - memory_usage=None, max_tokens=512, embed_dim=384, license="apache-2.0", @@ -103,8 +102,7 @@ use_instructions=True, adapted_from="sentence-transformers/all-MiniLM-L6-v2", superseded_by=None, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find + public_training_code=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -145,7 +143,6 @@ open_weights=True, framework=["Sentence Transformers", "PyTorch"], n_parameters=32_200_000, - memory_usage=None, max_tokens=512, embed_dim=384, license="apache-2.0", @@ -154,8 +151,7 @@ use_instructions=True, adapted_from="intfloat/e5-small-unsupervised", superseded_by=None, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find + public_training_code=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -196,7 +192,6 @@ open_weights=True, framework=["Sentence Transformers", "PyTorch"], n_parameters=109_000_000, - memory_usage=None, max_tokens=512, embed_dim=768, license="apache-2.0", @@ -205,8 +200,7 @@ use_instructions=True, adapted_from="intfloat/e5-base-unsupervised", 
superseded_by="Snowflake/snowflake-arctic-embed-m-v1.5", - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find + public_training_code=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -247,7 +241,6 @@ open_weights=True, framework=["Sentence Transformers", "PyTorch"], n_parameters=137_000_000, - memory_usage=None, max_tokens=2048, embed_dim=768, license="apache-2.0", @@ -256,8 +249,7 @@ use_instructions=True, adapted_from="nomic-ai/nomic-embed-text-v1-unsupervised", superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0", - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find + public_training_code=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -298,7 +290,6 @@ open_weights=True, framework=["Sentence Transformers", "PyTorch"], n_parameters=335_000_000, - memory_usage=None, max_tokens=512, embed_dim=1024, license="apache-2.0", @@ -307,8 +298,7 @@ use_instructions=True, adapted_from="intfloat/e5-base-unsupervised", superseded_by="Snowflake/snowflake-arctic-embed-l-v2.0", - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find + public_training_code=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -351,7 +341,6 @@ open_weights=True, framework=["Sentence Transformers", "PyTorch"], n_parameters=109_000_000, - memory_usage=None, max_tokens=512, embed_dim=768, license="apache-2.0", @@ -360,6 +349,8 @@ use_instructions=True, adapted_from=None, superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0", + public_training_code=None, + training_datasets=None, ) arctic_embed_m_v2_0 = ModelMeta( @@ -376,7 +367,6 @@ open_weights=True, framework=["Sentence Transformers", "PyTorch"], n_parameters=305_000_000, - memory_usage=None, max_tokens=8192, embed_dim=768, license="apache-2.0", @@ -385,8 +375,7 @@ use_instructions=True, adapted_from="Alibaba-NLP/gte-multilingual-base", superseded_by=None, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find + public_training_code=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -426,7 +415,6 @@ open_weights=True, framework=["Sentence Transformers", "PyTorch"], n_parameters=568_000_000, - memory_usage=None, max_tokens=8192, embed_dim=1024, license="apache-2.0", @@ -435,8 +423,7 @@ use_instructions=True, adapted_from="BAAI/bge-m3-retromae", superseded_by=None, - public_training_data=False, # couldn't find - public_training_code=False, # couldn't find + public_training_code=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index 05547d6a0..d8270c573 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -365,7 +365,6 @@ revision="5c38ec7c405ec4b44b94cc5a9bb96e735b38267a", release_date="2023-09-12", # initial commit of hf model. 
n_parameters=24_000_000, - memory_usage=None, embed_dim=512, license="mit", max_tokens=512, @@ -373,7 +372,6 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP public_training_code=None, # seemingly released (at least for some models, but the link is broken training_datasets=bge_training_data, ) @@ -391,7 +389,6 @@ revision="a5beb1e3e68b9ab74eb54cfd186867f64f240e1a", release_date="2023-09-11", # initial commit of hf model. n_parameters=438_000_000, - memory_usage=None, embed_dim=768, license="mit", max_tokens=512, @@ -399,7 +396,6 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP public_training_code=None, # seemingly released (at least for some models, but the link is broken training_datasets=bge_training_data, ) @@ -417,7 +413,6 @@ revision="d4aa6901d3a41ba39fb536a557fa166f842b0e09", release_date="2023-09-12", # initial commit of hf model. n_parameters=1_340_000_000, - memory_usage=None, embed_dim=1024, license="mit", max_tokens=512, @@ -425,7 +420,6 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP public_training_code=None, # seemingly released (at least for some models, but the link is broken training_datasets=bge_training_data, ) @@ -443,7 +437,6 @@ revision="7999e1d3359715c523056ef9478215996d62a620", release_date="2023-09-12", # initial commit of hf model. n_parameters=24_000_000, - memory_usage=None, embed_dim=512, license="mit", max_tokens=512, @@ -451,7 +444,6 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP public_training_code=None, # seemingly released (at least for some models, but the link is broken training_datasets=bge_chinese_training_data, ) @@ -469,7 +461,6 @@ revision="f03589ceff5aac7111bd60cfc7d497ca17ecac65", release_date="2023-09-11", # initial commit of hf model. n_parameters=438_000_000, - memory_usage=None, embed_dim=768, license="mit", max_tokens=512, @@ -477,7 +468,6 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP public_training_code=None, # seemingly released (at least for some models, but the link is broken training_datasets=bge_chinese_training_data, ) @@ -495,7 +485,6 @@ revision="79e7739b6ab944e86d6171e44d24c997fc1e0116", release_date="2023-09-12", # initial commit of hf model. 
n_parameters=1_340_000_000, - memory_usage=None, embed_dim=1024, license="mit", max_tokens=512, @@ -503,7 +492,6 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP public_training_code=None, # seemingly released (at least for some models, but the link is broken training_datasets=bge_chinese_training_data, ) @@ -520,7 +508,6 @@ revision="5617a9f61b028005a4858fdac845db406aefb181", release_date="2024-06-28", n_parameters=568_000_000, - memory_usage=None, embed_dim=4096, license="mit", max_tokens=8194, @@ -528,7 +515,6 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - public_training_data=True, public_training_code=None, training_datasets=bgem3_training_data, ) @@ -555,7 +541,6 @@ revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a", release_date="2024-07-25", # initial commit of hf model. n_parameters=9.24 * 1e9, - memory_usage=None, embed_dim=3584, # from old C-MTEB leaderboard license="gemma", max_tokens=8192, # from old C-MTEB leaderboard @@ -563,7 +548,6 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - public_training_data=False, - public_training_code=False, + public_training_code=None, training_datasets=None, # not disclosed ) diff --git a/mteb/models/bm25.py b/mteb/models/bm25.py index 423175270..ea56fd432 100644 --- a/mteb/models/bm25.py +++ b/mteb/models/bm25.py @@ -131,12 +131,13 @@ def encode(self, texts: list[str], **kwargs): revision="0_1_10", release_date="2024-07-10", ## release of version 0.1.10 n_parameters=None, - memory_usage=None, embed_dim=None, license=None, max_tokens=None, - reference=None, + reference="https://github.com/xhluca/bm25s", similarity_fn_name=None, framework=[], use_instructions=False, + public_training_code="https://github.com/xhluca/bm25s", + training_datasets=None, ) diff --git a/mteb/models/cohere_models.py b/mteb/models/cohere_models.py index 4b34045f8..8718a2e2a 100644 --- a/mteb/models/cohere_models.py +++ b/mteb/models/cohere_models.py @@ -227,7 +227,6 @@ def encode( revision="1", release_date="2023-11-02", n_parameters=None, - memory_usage=None, max_tokens=None, embed_dim=512, reference="https://cohere.com/blog/introducing-embed-v3", @@ -235,8 +234,7 @@ def encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - public_training_data=False, # assumed - public_training_code=False, # assumed + public_training_code=None, # assumed training_datasets=None, ) @@ -253,15 +251,13 @@ def encode( revision="1", release_date="2023-11-02", n_parameters=None, - memory_usage=None, max_tokens=512, embed_dim=1024, license=None, similarity_fn_name="cosine", framework=["API"], use_instructions=True, - public_training_data=False, # assumed - public_training_code=False, # assumed + public_training_code=None, # assumed training_datasets=None, ) @@ -278,15 +274,13 @@ def encode( reference="https://cohere.com/blog/introducing-embed-v3", release_date="2023-11-02", n_parameters=None, - memory_usage=None, max_tokens=512, embed_dim=384, license=None, similarity_fn_name="cosine", framework=["API"], use_instructions=True, - public_training_data=False, # assumed - public_training_code=False, # assumed + public_training_code=None, # assumed training_datasets=None, ) @@ -303,14 +297,12 @@ def encode( revision="1", release_date="2023-11-02", n_parameters=None, - memory_usage=None, max_tokens=512, embed_dim=384, license=None, 
similarity_fn_name="cosine", framework=["API"], use_instructions=True, - public_training_data=False, # assumed - public_training_code=False, # assumed + public_training_code=None, # assumed training_datasets=None, ) diff --git a/mteb/models/colbert_models.py b/mteb/models/colbert_models.py index 8753791bf..87b5fdb93 100644 --- a/mteb/models/colbert_models.py +++ b/mteb/models/colbert_models.py @@ -152,7 +152,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: languages=["eng_Latn"], open_weights=True, revision="c1e84128e85ef755c096a95bdb06b47793b13acf", - public_training_code=True, + public_training_code=None, release_date="2024-09-21", n_parameters=110 * 1e6, max_tokens=180, # Reduced for Benchmarking - see ColBERT paper @@ -164,6 +164,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: use_instructions=False, adapted_from=None, superseded_by=None, + training_datasets=None, ) @@ -203,7 +204,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: ], open_weights=True, revision="4cf816e5e2b03167b132a3c847a9ecd48ba708e1", - public_training_code=False, + public_training_code=None, release_date="2024-08-16", n_parameters=559 * 1e6, max_tokens=8192, @@ -215,4 +216,5 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: use_instructions=False, adapted_from=None, superseded_by=None, + training_datasets=None, ) diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py index 182a6ea4b..f4d590935 100644 --- a/mteb/models/e5_instruct.py +++ b/mteb/models/e5_instruct.py @@ -36,12 +36,10 @@ use_instructions=True, reference="https://huggingface.co/intfloat/multilingual-e5-large-instruct", n_parameters=560_000_000, - memory_usage=None, embed_dim=1024, license="mit", max_tokens=514, - public_training_data=False, - public_training_code=False, + public_training_code=None, training_datasets=E5_TRAINING_DATA, ) @@ -68,11 +66,9 @@ use_instructions=True, reference="https://huggingface.co/intfloat/e5-mistral-7b-instruct", n_parameters=7_111_000_000, - memory_usage=None, embed_dim=4096, license="mit", max_tokens=32768, - public_training_data=False, - public_training_code=False, + public_training_code=None, training_datasets=E5_TRAINING_DATA, ) diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py index 9537824e5..ace25ca08 100644 --- a/mteb/models/e5_models.py +++ b/mteb/models/e5_models.py @@ -139,7 +139,6 @@ revision="fd1525a9fd15316a2d503bf26ab031a61d056e98", release_date=E5_PAPER_RELEASE_DATE, n_parameters=118_000_000, - memory_usage=None, embed_dim=384, license="mit", max_tokens=512, @@ -147,8 +146,7 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_data=False, - public_training_code=False, + public_training_code=None, training_datasets=E5_TRAINING_DATA, ) @@ -164,7 +162,6 @@ revision="d13f1b27baf31030b7fd040960d60d909913633f", release_date=E5_PAPER_RELEASE_DATE, n_parameters=278_000_000, - memory_usage=None, embed_dim=768, license="mit", max_tokens=514, @@ -172,8 +169,7 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_data=False, - public_training_code=False, + public_training_code=None, training_datasets=E5_TRAINING_DATA, ) @@ -190,7 +186,6 @@ revision="ab10c1a7f42e74530fe7ae5be82e6d4f11a719eb", release_date=E5_PAPER_RELEASE_DATE, n_parameters=560_000_000, - memory_usage=None, embed_dim=1024, license="mit", max_tokens=514, @@ -198,8 +193,7 @@ similarity_fn_name="cosine", 
framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_data=False, - public_training_code=False, + public_training_code=None, training_datasets=E5_TRAINING_DATA, ) @@ -215,7 +209,6 @@ revision="dca8b1a9dae0d4575df2bf423a5edb485a431236", release_date=E5_PAPER_RELEASE_DATE, n_parameters=33_000_000, - memory_usage=None, embed_dim=384, license="mit", max_tokens=512, @@ -223,8 +216,7 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_data=False, - public_training_code=False, + public_training_code=None, training_datasets=E5_TRAINING_DATA, ) @@ -241,7 +233,6 @@ revision="e272f3049e853b47cb5ca3952268c6662abda68f", release_date=E5_PAPER_RELEASE_DATE, n_parameters=33_000_000, - memory_usage=None, embed_dim=384, license="mit", max_tokens=512, @@ -249,8 +240,7 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_data=False, - public_training_code=False, + public_training_code=None, training_datasets=E5_TRAINING_DATA, ) @@ -267,7 +257,6 @@ revision="1c644c92ad3ba1efdad3f1451a637716616a20e8", release_date=E5_PAPER_RELEASE_DATE, n_parameters=109_000_000, - memory_usage=None, embed_dim=768, license="mit", max_tokens=512, @@ -277,8 +266,7 @@ use_instructions=True, superseded_by=None, adapted_from=None, - public_training_data=False, - public_training_code=False, + public_training_code=None, training_datasets=E5_TRAINING_DATA, ) @@ -295,7 +283,6 @@ revision="b322e09026e4ea05f42beadf4d661fb4e101d311", release_date=E5_PAPER_RELEASE_DATE, n_parameters=335_000_000, - memory_usage=None, embed_dim=1024, license="mit", max_tokens=514, @@ -305,8 +292,7 @@ use_instructions=True, superseded_by=None, adapted_from=None, - public_training_data=False, - public_training_code=False, + public_training_code=None, training_datasets=E5_TRAINING_DATA, ) @@ -323,7 +309,6 @@ revision="4dc6d853a804b9c8886ede6dda8a073b7dc08a81", release_date="2022-12-26", n_parameters=335_000_000, - memory_usage=None, embed_dim=1024, license="apache-2.0", max_tokens=512, @@ -333,8 +318,7 @@ use_instructions=True, superseded_by="intfloat/e5-large-v2", adapted_from=None, - public_training_data=False, - public_training_code=False, + public_training_code=None, training_datasets=E5_TRAINING_DATA, ) @@ -351,7 +335,6 @@ revision="b533fe4636f4a2507c08ddab40644d20b0006d6a", release_date="2022-12-26", n_parameters=109_000_000, - memory_usage=None, embed_dim=768, license="apache-2.0", max_tokens=512, @@ -361,7 +344,6 @@ use_instructions=True, superseded_by="intfloat/e5-base-v2", adapted_from=None, - public_training_data=False, - public_training_code=False, + public_training_code=None, training_datasets=E5_TRAINING_DATA, ) diff --git a/mteb/models/google_models.py b/mteb/models/google_models.py index 1b4a4a13f..08065f7af 100644 --- a/mteb/models/google_models.py +++ b/mteb/models/google_models.py @@ -145,15 +145,13 @@ def encode( revision="1", # revision is intended for implementation release_date="2024-05-14", n_parameters=None, - memory_usage=None, max_tokens=2048, embed_dim=768, license=None, similarity_fn_name="cosine", # assumed framework=["API"], use_instructions=True, - public_training_data=False, # assumed - public_training_code=False, # assumed + public_training_code=None, # assumed training_datasets=None, ) @@ -169,15 +167,13 @@ def encode( revision="1", # revision is intended for implementation release_date="2024-11-18", n_parameters=None, - memory_usage=None, max_tokens=2048, 
embed_dim=768, license=None, similarity_fn_name="cosine", # assumed framework=["API"], use_instructions=True, - public_training_data=False, # assumed - public_training_code=False, # assumed + public_training_code=None, # assumed training_datasets=None, ) @@ -193,14 +189,12 @@ def encode( revision="1", # revision is intended for implementation release_date="2024-05-14", n_parameters=None, - memory_usage=None, max_tokens=2048, embed_dim=768, license=None, similarity_fn_name="cosine", # assumed framework=["API"], use_instructions=True, - public_training_data=False, # assumed - public_training_code=False, # assumed + public_training_code=None, # assumed training_datasets=None, ) diff --git a/mteb/models/gritlm_models.py b/mteb/models/gritlm_models.py index a4f5befd1..a68502b06 100644 --- a/mteb/models/gritlm_models.py +++ b/mteb/models/gritlm_models.py @@ -31,7 +31,6 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: revision="13f00a0e36500c80ce12870ea513846a066004af", release_date="2024-02-15", n_parameters=7_240_000_000, - memory_usage=None, embed_dim=4096, license="apache-2.0", max_tokens=4096, @@ -41,8 +40,7 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: use_instructions=True, training_datasets=E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data - public_training_code=True, # https://github.com/ContextualAI/gritlm - public_training_data=False, + public_training_code="https://github.com/ContextualAI/gritlm", ) gritlm8x7b = ModelMeta( loader=partial( # type: ignore @@ -58,7 +56,6 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: revision="7f089b13e3345510281733ca1e6ff871b5b4bc76", release_date="2024-02-15", n_parameters=57_920_000_000, - memory_usage=None, embed_dim=4096, license="apache-2.0", max_tokens=4096, @@ -68,6 +65,5 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: use_instructions=True, training_datasets=E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data - public_training_code=True, # https://github.com/ContextualAI/gritlm - public_training_data=False, + public_training_code="https://github.com/ContextualAI/gritlm", ) diff --git a/mteb/models/gte_models.py b/mteb/models/gte_models.py index f800aaa94..da265e79c 100644 --- a/mteb/models/gte_models.py +++ b/mteb/models/gte_models.py @@ -39,13 +39,15 @@ def instruction_template( revision="e26182b2122f4435e8b3ebecbf363990f409b45b", release_date="2024-06-15", # initial commit of hf model. n_parameters=7_613_000_000, - memory_usage=None, embed_dim=3584, license="apache-2.0", reference="https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_code=None, + training_datasets=None, + max_tokens=131072, ) @@ -67,7 +69,6 @@ def instruction_template( revision="07d27e5226328010336563bc1b564a5e3436a298", release_date="2024-04-20", # initial commit of hf model. 
n_parameters=7_720_000_000, - memory_usage=None, embed_dim=4096, license="apache-2.0", max_tokens=32768, @@ -75,6 +76,8 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_code=None, + training_datasets=None, ) @@ -96,7 +99,6 @@ def instruction_template( revision="c6c1b92f4a3e1b92b326ad29dd3c8433457df8dd", release_date="2024-07-29", # initial commit of hf model. n_parameters=1_780_000_000, - memory_usage=None, embed_dim=8960, license="apache-2.0", max_tokens=131072, @@ -104,6 +106,8 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_code=None, + training_datasets=None, ) gte_small_zh = ModelMeta( @@ -118,7 +122,6 @@ def instruction_template( revision="af7bd46fbb00b3a6963c8dd7f1786ddfbfbe973a", release_date="2023-11-08", # initial commit of hf model. n_parameters=30.3 * 1e6, - memory_usage=None, embed_dim=1024, license="mit", max_tokens=512, @@ -126,7 +129,6 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - public_training_data=False, public_training_code=None, training_datasets=None, # Not disclosed ) @@ -143,7 +145,6 @@ def instruction_template( revision="71ab7947d6fac5b64aa299e6e40e6c2b2e85976c", release_date="2023-11-08", # initial commit of hf model. n_parameters=102 * 1e6, - memory_usage=None, embed_dim=1024, license="mit", max_tokens=512, @@ -151,7 +152,6 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - public_training_data=False, public_training_code=None, training_datasets=None, # Not disclosed ) @@ -168,7 +168,6 @@ def instruction_template( revision="64c364e579de308104a9b2c170ca009502f4f545", release_date="2023-11-08", # initial commit of hf model. n_parameters=326 * 1e6, - memory_usage=None, embed_dim=1024, license="mit", max_tokens=512, @@ -176,7 +175,6 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - public_training_data=False, public_training_code=None, training_datasets=None, # Not disclosed ) @@ -286,7 +284,6 @@ def instruction_template( revision="ca1791e0bcc104f6db161f27de1340241b13c5a4", release_date="2024-07-20", # initial commit of hf model. 
n_parameters=305 * 1e6, - memory_usage=None, embed_dim=1024, license="apache-2", max_tokens=8192, @@ -294,7 +291,6 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - public_training_data=True, public_training_code=None, # couldn't find training_datasets=gte_multi_training_data, ) diff --git a/mteb/models/ibm_granite_models.py b/mteb/models/ibm_granite_models.py index c2443de23..78bad6097 100644 --- a/mteb/models/ibm_granite_models.py +++ b/mteb/models/ibm_granite_models.py @@ -33,7 +33,6 @@ revision="47db56afe692f731540413c67dd818ff492277e7", release_date="2024-12-18", n_parameters=107_000_000, - memory_usage=None, embed_dim=384, license="apache-2.0", max_tokens=512, @@ -42,6 +41,9 @@ framework=["Sentence Transformers", "PyTorch"], adapted_from=None, superseded_by=None, + public_training_code=None, + use_instructions=False, + training_datasets=None, ) granite_278m_multilingual = ModelMeta( @@ -56,7 +58,6 @@ revision="84e3546b88b0cb69f8078608a1df558020bcbf1f", release_date="2024-12-18", n_parameters=278_000_000, - memory_usage=None, embed_dim=768, license="apache-2.0", max_tokens=512, @@ -65,6 +66,9 @@ framework=["Sentence Transformers", "PyTorch"], adapted_from=None, superseded_by=None, + public_training_code=None, + use_instructions=False, + training_datasets=None, ) granite_30m_english = ModelMeta( @@ -79,7 +83,6 @@ revision="eddbb57470f896b5f8e2bfcb823d8f0e2d2024a5", release_date="2024-12-18", n_parameters=30_000_000, - memory_usage=None, embed_dim=384, license="apache-2.0", max_tokens=512, @@ -88,6 +91,9 @@ framework=["Sentence Transformers", "PyTorch"], adapted_from=None, superseded_by=None, + public_training_code=None, + use_instructions=False, + training_datasets=None, ) granite_125m_english = ModelMeta( @@ -102,7 +108,6 @@ revision="e48d3a5b47eaa18e3fe07d4676e187fd80f32730", release_date="2024-12-18", n_parameters=125_000_000, - memory_usage=None, embed_dim=768, license="apache-2.0", max_tokens=512, @@ -111,4 +116,7 @@ framework=["Sentence Transformers", "PyTorch"], adapted_from=None, superseded_by=None, + public_training_code=None, + use_instructions=False, + training_datasets=None, ) diff --git a/mteb/models/inf_models.py b/mteb/models/inf_models.py index 4670b2073..dc31adccd 100644 --- a/mteb/models/inf_models.py +++ b/mteb/models/inf_models.py @@ -17,7 +17,6 @@ revision="d2d074546028c0012b5cc6af78c4fac24896e67f", release_date="2024-12-24", # initial commit of hf model. 
n_parameters=7_069_121_024, - memory_usage=None, embed_dim=3584, license="apache-2.0", max_tokens=131_072, @@ -26,7 +25,6 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, adapted_from="Alibaba-NLP/gte-Qwen2-7B-instruct", - public_training_code=False, - public_training_data=False, + public_training_code=None, training_datasets=None, ) diff --git a/mteb/models/jasper_models.py b/mteb/models/jasper_models.py index 0062df2ac..1dc06d564 100644 --- a/mteb/models/jasper_models.py +++ b/mteb/models/jasper_models.py @@ -81,7 +81,6 @@ def encode( revision="d6330ce98f8a0d741e781df845904c9484f00efa", release_date="2024-12-11", # first commit n_parameters=1_999_000_000, - memory_usage=None, max_tokens=131072, embed_dim=8960, license="apache-2.0", @@ -94,5 +93,4 @@ def encode( training_datasets=nvidia_training_datasets, # "In jasper model the teacher model is nvidia/NV-Embed-v2", source https://huggingface.co/infgrad/jasper_en_vision_language_v1 # "non_mteb": ["BAAI/Infinity-MM", "HuggingFaceFW/fineweb-edu"], public_training_code=None, - public_training_data=None, ) diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py index 728ffaa98..265d51237 100644 --- a/mteb/models/jina_models.py +++ b/mteb/models/jina_models.py @@ -223,8 +223,7 @@ def encode( use_instructions=True, reference="https://huggingface.co/jinaai/jina-embeddings-v3", training_datasets=None, - public_training_code=False, - public_training_data=False, + public_training_code=None, ) @@ -235,7 +234,6 @@ def encode( revision="6e85f575bc273f1fd840a658067d0157933c83f0", release_date="2023-09-27", n_parameters=137_000_000, - memory_usage=None, embed_dim=768, license="apache-2.0", max_tokens=8192, @@ -246,8 +244,7 @@ def encode( superseded_by=None, adapted_from=None, training_datasets=None, - public_training_code=False, - public_training_data=False, # uses scrapes e.g. CC + public_training_code=None, ) jina_embeddings_v2_small_en = ModelMeta( @@ -257,7 +254,6 @@ def encode( revision="796cff318cdd4e5fbe8b7303a1ef8cbec36996ef", release_date="2023-09-27", n_parameters=32_700_000, - memory_usage=None, embed_dim=512, license="apache-2.0", max_tokens=8192, @@ -268,8 +264,7 @@ def encode( superseded_by=None, adapted_from=None, training_datasets=None, - public_training_code=False, - public_training_data=False, # uses scrapes e.g. CC and {"jinaai/negation-dataset": ["train"]} + public_training_code=None, ) jina_embedding_b_en_v1 = ModelMeta( @@ -279,7 +274,6 @@ def encode( revision="aa0645035294a8c0607ce5bb700aba982cdff32c", release_date="2023-07-07", n_parameters=110_000_000, - memory_usage=None, embed_dim=768, license="apache-2.0", max_tokens=512, @@ -290,8 +284,7 @@ def encode( superseded_by="jinaai/jina-embeddings-v2-base-en", adapted_from=None, training_datasets=None, - public_training_code=False, - public_training_data=False, # uses scrapes e.g. CC and {"jinaai/negation-dataset": ["train"]} + public_training_code=None, ) jina_embedding_s_en_v1 = ModelMeta( @@ -301,7 +294,6 @@ def encode( revision="c1fed70aa4823a640f1a7150a276e4d3b08dce08", release_date="2023-07-07", n_parameters=35_000_000, - memory_usage=None, embed_dim=512, license="apache-2.0", max_tokens=512, @@ -312,6 +304,5 @@ def encode( superseded_by="jinaai/jina-embeddings-v2-small-en", adapted_from=None, training_datasets=None, - public_training_code=False, - public_training_data=False, # uses scrapes e.g. 
CC and {"jinaai/negation-dataset": ["train"]} + public_training_code=None, ) diff --git a/mteb/models/linq_models.py b/mteb/models/linq_models.py index 4babbf75c..11cfa74ed 100644 --- a/mteb/models/linq_models.py +++ b/mteb/models/linq_models.py @@ -32,7 +32,6 @@ def instruction_template( revision="0c1a0b0589177079acc552433cad51d7c9132379", release_date="2024-05-29", # initial commit of hf model. n_parameters=7_110_000_000, - memory_usage=None, embed_dim=4096, license="cc-by-nc-4.0", max_tokens=32768, @@ -40,4 +39,6 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_code=None, + training_datasets=None, ) diff --git a/mteb/models/llm2vec_models.py b/mteb/models/llm2vec_models.py index cbc42fe5e..a5f1a69a3 100644 --- a/mteb/models/llm2vec_models.py +++ b/mteb/models/llm2vec_models.py @@ -117,7 +117,6 @@ def loader_inner(**kwargs: Any) -> Encoder: revision="baa8ebf04a1c2500e61288e7dad65e8ae42601a7", # TODO: Not sure what to put here as a model is made of two peft repos, each with a different revision release_date="2024-04-09", n_parameters=7_505_000_000, - memory_usage=None, max_tokens=8192, embed_dim=4096, license="mit", @@ -125,8 +124,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, - public_training_code=True, - public_training_data=True, + public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, ) @@ -144,7 +142,6 @@ def loader_inner(**kwargs: Any) -> Encoder: revision="1cb7b735326d13a8541db8f57f35da5373f5e9c6", release_date="2024-04-09", n_parameters=7_505_000_000, - memory_usage=None, max_tokens=8192, embed_dim=4096, license="mit", @@ -152,8 +149,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, - public_training_code=True, - public_training_data=True, + public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, ) @@ -172,7 +168,6 @@ def loader_inner(**kwargs: Any) -> Encoder: revision="0ae69bdd5816105778b971c3138e8f8a18eaa3ae", release_date="2024-04-09", n_parameters=7_111_000_000, - memory_usage=None, max_tokens=32768, embed_dim=4096, license="mit", @@ -180,8 +175,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, - public_training_code=True, - public_training_data=True, + public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, ) @@ -199,7 +193,6 @@ def loader_inner(**kwargs: Any) -> Encoder: revision="2c055a5d77126c0d3dc6cd8ffa30e2908f4f45f8", release_date="2024-04-09", n_parameters=7_111_000_000, - memory_usage=None, max_tokens=32768, embed_dim=4096, license="mit", @@ -207,8 +200,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, - public_training_code=True, - public_training_data=True, + public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, ) @@ -226,7 +218,6 @@ def loader_inner(**kwargs: Any) -> Encoder: 
revision="2c055a5d77126c0d3dc6cd8ffa30e2908f4f45f8", release_date="2024-04-09", n_parameters=7_111_000_000, - memory_usage=None, max_tokens=32768, embed_dim=4096, license="mit", @@ -234,8 +225,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, - public_training_code=True, - public_training_data=True, + public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, ) @@ -253,7 +243,6 @@ def loader_inner(**kwargs: Any) -> Encoder: revision="a76944871d169ebe7c97eb921764cd063afed785", release_date="2024-04-09", n_parameters=7_111_000_000, - memory_usage=None, max_tokens=32768, embed_dim=4096, license="mit", @@ -261,8 +250,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, - public_training_code=True, - public_training_data=True, + public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, ) @@ -280,7 +268,6 @@ def loader_inner(**kwargs: Any) -> Encoder: revision="a5943d406c6b016fef3f07906aac183cf1a0b47d", release_date="2024-04-09", n_parameters=7_111_000_000, - memory_usage=None, max_tokens=32768, embed_dim=4096, license="mit", @@ -288,8 +275,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, - public_training_code=True, - public_training_data=True, + public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, ) @@ -307,7 +293,6 @@ def loader_inner(**kwargs: Any) -> Encoder: revision="a5943d406c6b016fef3f07906aac183cf1a0b47d", release_date="2024-04-09", n_parameters=7_111_000_000, - memory_usage=None, max_tokens=32768, embed_dim=4096, license="mit", @@ -315,7 +300,6 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["LLM2Vec", "PyTorch"], use_instructions=True, - public_training_code=True, - public_training_data=True, + public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, ) diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index 09e423240..5233ecec6 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -17,12 +17,10 @@ languages=["eng_Latn"], loader=None, n_parameters=7110660096, - memory_usage=None, max_tokens=32768.0, embed_dim=None, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/Haon-Chen/speed-embedding-7b-instruct", @@ -39,12 +37,10 @@ languages=[], loader=None, n_parameters=278043648, - memory_usage=None, max_tokens=514.0, embed_dim=768, license=None, open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Gameselo/STS-multilingual-mpnet-base-v2", @@ -61,12 +57,10 @@ languages=None, loader=None, n_parameters=494032768, - memory_usage=None, max_tokens=131072.0, embed_dim=896, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], 
reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", @@ -83,12 +77,10 @@ languages=None, loader=None, n_parameters=494032768, - memory_usage=None, max_tokens=131072.0, embed_dim=896, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-v1", @@ -105,12 +97,10 @@ languages=["eng_Latn"], loader=None, n_parameters=None, - memory_usage=None, max_tokens=None, embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/Hum-Works/lodestone-base-4096-v1", @@ -169,12 +159,10 @@ languages=[], loader=None, n_parameters=2506172416, - memory_usage=None, max_tokens=8192.0, embed_dim=2048, license=None, open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Jaume/gemma-2b-embeddings", @@ -191,12 +179,10 @@ languages=["eng_Latn"], loader=None, n_parameters=7241732096, - memory_usage=None, max_tokens=32768.0, embed_dim=None, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/BeastyZ/e5-R-mistral-7b", @@ -219,12 +205,10 @@ trust_remote_code=True, ), n_parameters=278043648, - memory_usage=None, max_tokens=514.0, embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Lajavaness/bilingual-embedding-base", @@ -246,12 +230,10 @@ trust_remote_code=True, ), n_parameters=559890432, - memory_usage=None, max_tokens=514.0, embed_dim=1024, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Lajavaness/bilingual-embedding-large", @@ -273,12 +255,10 @@ trust_remote_code=True, ), n_parameters=117653760, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Lajavaness/bilingual-embedding-small", @@ -295,12 +275,10 @@ languages=None, loader=None, n_parameters=17389824, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Bulbasaur", @@ -318,12 +296,10 @@ languages=None, loader=None, n_parameters=22713216, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Ivysaur", @@ -341,12 +317,10 @@ languages=None, loader=None, n_parameters=15615360, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="mit", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Squirtle", @@ -364,12 +338,10 @@ languages=None, loader=None, n_parameters=15615360, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="mit", open_weights=True, - 
public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Venusaur", @@ -387,12 +359,10 @@ languages=None, loader=None, n_parameters=17389824, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="mit", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Wartortle", @@ -410,12 +380,10 @@ languages=None, loader=None, n_parameters=17389824, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/gte-micro", @@ -432,12 +400,10 @@ languages=None, loader=None, n_parameters=19164288, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/gte-micro-v4", @@ -454,12 +420,10 @@ languages=["fra_Latn"], loader=None, n_parameters=559890432, - memory_usage=None, max_tokens=514.0, embed_dim=1024, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/OrdalieTech/Solon-embeddings-large-0.1", @@ -476,12 +440,10 @@ languages=["ara_Arab"], loader=None, n_parameters=135193344, - memory_usage=None, max_tokens=512.0, embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka", @@ -498,12 +460,10 @@ languages=["ara_Arab"], loader=None, n_parameters=117653760, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet", @@ -522,12 +482,10 @@ languages=["ara_Arab"], loader=None, n_parameters=278043648, - memory_usage=None, max_tokens=514.0, embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka", @@ -546,12 +504,10 @@ languages=["ara_Arab"], loader=None, n_parameters=470926848, - memory_usage=None, max_tokens=512.0, embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-labse-Matryoshka", @@ -570,12 +526,10 @@ languages=["ara_Arab"], loader=None, n_parameters=109486464, - memory_usage=None, max_tokens=514.0, embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet", @@ -594,12 +548,10 @@ languages=["ara_Arab"], loader=None, n_parameters=162841344, - memory_usage=None, max_tokens=512.0, embed_dim=768, license="apache-2.0", open_weights=True, - 
public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka", @@ -616,12 +568,10 @@ languages=None, loader=None, n_parameters=None, - memory_usage=None, max_tokens=512.0, embed_dim=1024, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/consciousAI/cai-lunaris-text-embeddings", @@ -638,12 +588,10 @@ languages=None, loader=None, n_parameters=None, - memory_usage=None, max_tokens=514.0, embed_dim=768, license=None, open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/consciousAI/cai-stellaris-text-embeddings", @@ -660,12 +608,10 @@ languages=None, loader=None, n_parameters=567754752, - memory_usage=None, max_tokens=8194.0, embed_dim=1024, license=None, open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/bge-m3-custom-fr", @@ -682,12 +628,10 @@ languages=None, loader=None, n_parameters=1279887360, - memory_usage=None, max_tokens=2048.0, embed_dim=2048, license=None, open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.2", @@ -704,12 +648,10 @@ languages=None, loader=None, n_parameters=1279887360, - memory_usage=None, max_tokens=2048.0, embed_dim=2048, license=None, open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.3", @@ -726,12 +668,10 @@ languages=["fra_Latn", "eng_Latn"], loader=None, n_parameters=1279887360, - memory_usage=None, max_tokens=2048.0, embed_dim=2048, license="mit", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.4", @@ -749,12 +689,10 @@ languages=["eng_Latn"], loader=None, n_parameters=109482752, - memory_usage=None, max_tokens=512.0, embed_dim=768, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/thenlper/gte-base", @@ -771,12 +709,10 @@ languages=["eng_Latn"], loader=None, n_parameters=335142400, - memory_usage=None, max_tokens=512.0, embed_dim=1024, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/thenlper/gte-large", @@ -793,12 +729,10 @@ languages=["eng_Latn"], loader=None, n_parameters=33360512, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/thenlper/gte-small", @@ -815,12 +749,10 @@ languages=["pol_Latn"], loader=None, n_parameters=103705344, - memory_usage=None, max_tokens=512.0, embed_dim=768, license="gpl-3.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/OrlikB/KartonBERT-USE-base-v1", @@ -837,12 +769,10 @@ languages=["pol_Latn"], loader=None, n_parameters=None, - memory_usage=None, 
max_tokens=514.0, embed_dim=768, license="lgpl", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/OrlikB/st-polish-kartonberta-base-alpha-v1", @@ -859,12 +789,10 @@ languages=["pol_Latn"], loader=None, n_parameters=278043648, - memory_usage=None, max_tokens=514.0, embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-e5-base", @@ -881,12 +809,10 @@ languages=["eng_Latn"], loader=None, n_parameters=None, - memory_usage=None, max_tokens=4096.0, embed_dim=None, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/dwzhu/e5-base-4k", @@ -903,12 +829,10 @@ languages=["pol_Latn"], loader=None, n_parameters=559890432, - memory_usage=None, max_tokens=514.0, embed_dim=1024, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-e5-large", @@ -925,12 +849,10 @@ languages=["pol_Latn"], loader=None, n_parameters=117653760, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-e5-small", @@ -947,12 +869,10 @@ languages=["pol_Latn"], loader=None, n_parameters=124442880, - memory_usage=None, max_tokens=514.0, embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-roberta-base", @@ -969,12 +889,10 @@ languages=["pol_Latn"], loader=None, n_parameters=434961408, - memory_usage=None, max_tokens=514.0, embed_dim=1024, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-roberta-large", @@ -1037,12 +955,10 @@ ], loader=None, n_parameters=None, - memory_usage=None, max_tokens=None, embed_dim=None, license="bigscience-bloom-rail-1.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-1b1", @@ -1105,12 +1021,10 @@ ], loader=None, n_parameters=None, - memory_usage=None, max_tokens=None, embed_dim=None, license="bigscience-bloom-rail-1.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-3b", @@ -1173,12 +1087,10 @@ ], loader=None, n_parameters=None, - memory_usage=None, max_tokens=None, embed_dim=None, license="bigscience-bloom-rail-1.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-560m", @@ -1241,12 +1153,10 @@ ], loader=None, n_parameters=None, - memory_usage=None, max_tokens=None, embed_dim=None, license="bigscience-bloom-rail-1.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-7b1", @@ -1263,12 +1173,10 @@ languages=["eng_Latn"], loader=None, n_parameters=109482240, - memory_usage=None, max_tokens=512.0, embed_dim=768, license="mit", open_weights=True, - public_training_data=False, 
public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-Embedding-v0", @@ -1285,12 +1193,10 @@ languages=["eng_Latn"], loader=None, n_parameters=22713216, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-all-MiniLM-L6-v2", @@ -1307,12 +1213,10 @@ languages=["eng_Latn"], loader=None, n_parameters=335141888, - memory_usage=None, max_tokens=512.0, embed_dim=1024, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-large-Embedding-v0", @@ -1329,12 +1233,10 @@ languages=["eng_Latn"], loader=None, n_parameters=33360000, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-small-Embedding-v0", @@ -1351,12 +1253,10 @@ languages=None, loader=None, n_parameters=None, - memory_usage=None, max_tokens=None, embed_dim=4096, license=None, open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/bigscience/sgpt-bloom-7b1-msmarco", @@ -1373,12 +1273,10 @@ languages=["deu_Latn"], loader=None, n_parameters=335736320, - memory_usage=None, max_tokens=512.0, embed_dim=1024, license=None, open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/aari1995/German_Semantic_STS_V2", @@ -1396,12 +1294,10 @@ languages=["eng_Latn"], loader=None, n_parameters=33360000, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/abhinand/MedEmbed-small-v0.1", @@ -1424,12 +1320,10 @@ languages=["eng_Latn"], loader=None, n_parameters=33360000, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/NoInstruct-small-Embedding-v0", @@ -1446,12 +1340,10 @@ languages=["eng_Latn"], loader=None, n_parameters=22713216, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/brahmairesearch/slx-v0.1", @@ -1468,12 +1360,10 @@ languages=None, loader=None, n_parameters=None, - memory_usage=None, max_tokens=514.0, embed_dim=768, license=None, open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/deepfile/embedder-100p", @@ -1490,12 +1380,10 @@ languages=["rus_Cyrl"], loader=None, n_parameters=359026688, - memory_usage=None, max_tokens=8194.0, embed_dim=1024, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/deepvk/USER-bge-m3", @@ -1523,12 +1411,10 @@ languages=["eng_Latn"], loader=None, 
n_parameters=None, - memory_usage=None, max_tokens=512.0, embed_dim=None, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/infgrad/stella-base-en-v2", @@ -1545,12 +1431,10 @@ languages=None, loader=None, n_parameters=98688000, - memory_usage=None, max_tokens=512.0, embed_dim=1024, license=None, open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/malenia1/ternary-weight-embedding", @@ -1567,12 +1451,10 @@ languages=["ara_Arab", "eng_Latn"], loader=None, n_parameters=559890432, - memory_usage=None, max_tokens=514.0, embed_dim=1024, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/omarelshehy/arabic-english-sts-matryoshka", @@ -1599,12 +1481,10 @@ release_date="2024-09-04", languages=["zho_Hans", "eng_Latn"], n_parameters=2724880896, - memory_usage=None, max_tokens=512.0, embed_dim=2304, license=None, open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/openbmb/MiniCPM-Embedding", @@ -1631,12 +1511,10 @@ ], loader=None, n_parameters=117654272, - memory_usage=None, max_tokens=512.0, embed_dim=384, license="apache-2.0", open_weights=True, - public_training_data=True, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/shibing624/text2vec-base-multilingual", @@ -1654,12 +1532,10 @@ languages=["ara_Arab", "eng_Latn"], loader=None, n_parameters=135193344, - memory_usage=None, max_tokens=512.0, embed_dim=768, license="apache-2.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/silma-ai/silma-embeddding-matryoshka-v0.1", @@ -1676,12 +1552,10 @@ languages=["eng_Latn"], loader=None, n_parameters=7110660096, - memory_usage=None, max_tokens=32768.0, embed_dim=4096, license="mit", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch"], reference="https://huggingface.co/zeta-alpha-ai/Zeta-Alpha-E5-Mistral", @@ -1698,12 +1572,10 @@ languages=["zho_Hans"], loader=None, n_parameters=None, # Not visible on repo - memory_usage=None, max_tokens=512, embed_dim=128, license="apache-2", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/DMetaSoul/sbert-chinese-general-v1", @@ -1724,12 +1596,10 @@ languages=["zho_Hans"], loader=None, n_parameters=74.2 * 1e6, - memory_usage=None, max_tokens=1024, embed_dim=768, license="apache-2", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/DMetaSoul/Dmeta-embedding-zh-small/", @@ -1745,12 +1615,10 @@ languages=["zho_Hans"], loader=None, n_parameters=326 * 1e6, - memory_usage=None, max_tokens=512, embed_dim=1024, license="not specified", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/lier007/xiaobu-embedding", @@ -1767,12 +1635,10 @@ languages=["zho_Hans"], loader=None, n_parameters=326 * 1e6, - memory_usage=None, max_tokens=512, 
embed_dim=768, license="not specified", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/lier007/xiaobu-embedding-v2", @@ -1789,12 +1655,10 @@ languages=["zho_Hans"], loader=None, n_parameters=326 * 1e6, - memory_usage=None, max_tokens=512, embed_dim=1024, license="not specified", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Classical/Yinka", @@ -1811,12 +1675,10 @@ languages=["zho_Hans"], loader=None, n_parameters=326 * 1e6, - memory_usage=None, max_tokens=512, embed_dim=768, license="cc-by-nc-4.0", open_weights=True, - public_training_data=False, public_training_code=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Classical/Yinka", diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index 1a58bbf8e..afbf9df62 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -75,8 +75,7 @@ def encode( adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, training_datasets=bge_training_data, # distilled - public_training_code=True, # https://github.com/MinishLab/model2vec - public_training_data=False, + public_training_code="https://github.com/MinishLab/model2vec", # ) @@ -101,8 +100,7 @@ def encode( adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, training_datasets=bge_training_data, # distilled - public_training_code=True, # https://github.com/MinishLab/model2vec - public_training_data=False, + public_training_code="https://github.com/MinishLab/model2vec", ) m2v_base_output = ModelMeta( @@ -126,8 +124,7 @@ def encode( adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, training_datasets=bge_training_data, # distilled - public_training_code=True, # https://github.com/MinishLab/model2vec - public_training_data=False, + public_training_code="https://github.com/MinishLab/model2vec", ) m2v_multilingual_output = ModelMeta( @@ -151,8 +148,7 @@ def encode( adapted_from="sentence-transformers/LaBSE", superseded_by=None, training_datasets=None, - public_training_code=True, # https://github.com/MinishLab/model2vec - public_training_data=False, + public_training_code="https://github.com/MinishLab/model2vec", ) potion_base_2m = ModelMeta( @@ -176,8 +172,7 @@ def encode( adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, training_datasets=bge_training_data, # distilled - public_training_code=True, # https://github.com/MinishLab/model2vec - public_training_data=False, + public_training_code="https://github.com/MinishLab/model2vec", ) potion_base_4m = ModelMeta( @@ -201,8 +196,7 @@ def encode( adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, training_datasets=bge_training_data, # distilled - public_training_code=True, # https://github.com/MinishLab/model2vec - public_training_data=False, + public_training_code="https://github.com/MinishLab/model2vec", ) potion_base_8m = ModelMeta( @@ -226,6 +220,5 @@ def encode( adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, training_datasets=bge_training_data, # distilled - public_training_code=True, # https://github.com/MinishLab/model2vec - public_training_data=False, + public_training_code="https://github.com/MinishLab/model2vec", ) diff --git a/mteb/models/moka_models.py b/mteb/models/moka_models.py index cf9b96f88..d3943d78d 100644 --- a/mteb/models/moka_models.py +++ b/mteb/models/moka_models.py @@ -86,7 +86,6 @@ 
revision="764b537a0e50e5c7d64db883f2d2e051cbe3c64c", release_date="2023-06-06", # first commit n_parameters=102 * 1e6, - memory_usage=None, embed_dim=768, # They don't give a specific license but commercial use is not allowed license="unspecified-noncommercial", @@ -97,8 +96,7 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, # Not published - public_training_data=False, # They haven't published it yet + public_training_code=None, # Not published training_datasets=m3e_dataset, ) @@ -109,7 +107,6 @@ revision="44c696631b2a8c200220aaaad5f987f096e986df", release_date="2023-06-02", # first commit n_parameters=None, # Can't be seen on HF page - memory_usage=None, embed_dim=512, # They don't give a specific license but commercial use is not allowed license="unspecified-noncommercial", @@ -120,8 +117,7 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, # Not published - public_training_data=False, # They haven't published it yet + public_training_code=None, # Not published training_datasets=m3e_dataset, ) @@ -133,7 +129,6 @@ revision="12900375086c37ba5d83d1e417b21dc7d1d1f388", release_date="2023-06-21", # first commit n_parameters=None, # Can't be seen on HF page - memory_usage=None, embed_dim=768, # They don't give a specific license but commercial use is not allowed license="unspecified-noncommercial", @@ -144,7 +139,6 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, # Not published - public_training_data=False, # They haven't published it yet + public_training_code=None, # Not published training_datasets=m3e_dataset, ) diff --git a/mteb/models/mxbai_models.py b/mteb/models/mxbai_models.py index 5dfb9dc42..04978a190 100644 --- a/mteb/models/mxbai_models.py +++ b/mteb/models/mxbai_models.py @@ -19,7 +19,6 @@ revision="990580e27d329c7408b3741ecff85876e128e203", release_date="2024-03-07", # initial commit of hf model. 
n_parameters=335_000_000, - memory_usage=None, max_tokens=512, embed_dim=1024, license="apache-2.0", @@ -27,4 +26,6 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, + public_training_code=None, + training_datasets=None, ) diff --git a/mteb/models/no_instruct_sentence_models.py b/mteb/models/no_instruct_sentence_models.py index 019cfe7e0..a0596b9bd 100644 --- a/mteb/models/no_instruct_sentence_models.py +++ b/mteb/models/no_instruct_sentence_models.py @@ -90,7 +90,6 @@ def encode( # type: ignore revision="b38747000553d8268915c95a55fc87e707c9aadd", release_date="2024-05-01", # first commit n_parameters=33_400_000, - memory_usage=None, max_tokens=512, embed_dim=384, license="mit", @@ -100,4 +99,6 @@ def encode( # type: ignore use_instructions=False, adapted_from=None, superseded_by=None, + public_training_code=None, + training_datasets=None, ) diff --git a/mteb/models/nomic_models.py b/mteb/models/nomic_models.py index aa6989941..5d9da7b59 100644 --- a/mteb/models/nomic_models.py +++ b/mteb/models/nomic_models.py @@ -117,7 +117,6 @@ def encode( # type: ignore revision="b0753ae76394dd36bcfb912a46018088bca48be0", release_date="2024-02-10", # first commit n_parameters=137_000_000, - memory_usage=None, max_tokens=8192, embed_dim=768, license="apache-2.0", @@ -127,6 +126,8 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, + public_training_code=None, + training_datasets=None, ) nomic_embed_v1 = ModelMeta( @@ -143,7 +144,6 @@ def encode( # type: ignore revision="0759316f275aa0cb93a5b830973843ca66babcf5", release_date="2024-01-31", # first commit n_parameters=None, - memory_usage=None, max_tokens=8192, embed_dim=768, license="apache-2.0", @@ -153,6 +153,8 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by="nomic-ai/nomic-embed-text-v1.5", + public_training_code=None, + training_datasets=None, ) nomic_embed_v1_ablated = ModelMeta( @@ -169,7 +171,6 @@ def encode( # type: ignore revision="7d948905c5d5d3874fa55a925d68e49dbf411e5f", release_date="2024-01-15", # first commit n_parameters=None, - memory_usage=None, max_tokens=8192, embed_dim=768, license="apache-2.0", @@ -179,6 +180,8 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, + public_training_code=None, + training_datasets=None, ) @@ -196,7 +199,6 @@ def encode( # type: ignore revision="b53d557b15ae63852847c222d336c1609eced93c", release_date="2024-01-15", # first commit n_parameters=None, - memory_usage=None, max_tokens=8192, embed_dim=768, license="apache-2.0", @@ -206,6 +208,8 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, + public_training_code=None, + training_datasets=None, ) nomic_modern_bert_embed = ModelMeta( @@ -224,7 +228,6 @@ def encode( # type: ignore revision="5960f1566fb7cb1adf1eb6e816639cf4646d9b12", release_date="2024-12-29", n_parameters=149_000_000, - memory_usage=None, max_tokens=8192, embed_dim=768, license="apache-2.0", @@ -234,4 +237,6 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, + public_training_code=None, + training_datasets=None, ) diff --git a/mteb/models/nvidia_models.py b/mteb/models/nvidia_models.py index 6bf4e041a..1f345a62b 100644 --- a/mteb/models/nvidia_models.py +++ b/mteb/models/nvidia_models.py @@ -132,7 +132,6 @@ def encode( revision="7604d305b621f14095a1aa23d351674c2859553a", release_date="2024-09-09", # initial commit of hf model. 
n_parameters=7_850_000_000, - memory_usage=None, embed_dim=4096, license="cc-by-nc-4.0", max_tokens=32768, @@ -142,7 +141,6 @@ def encode( use_instructions=True, training_datasets=nvidia_training_datasets, public_training_code=None, - public_training_data=True, ) NV_embed_v1 = ModelMeta( @@ -157,7 +155,6 @@ def encode( revision="570834afd5fef5bf3a3c2311a2b6e0a66f6f4f2c", release_date="2024-09-13", # initial commit of hf model. n_parameters=7_850_000_000, - memory_usage=None, embed_dim=4096, license="cc-by-nc-4.0", max_tokens=32768, @@ -167,5 +164,4 @@ def encode( use_instructions=True, training_datasets=nvidia_training_datasets, public_training_code=None, - public_training_data=True, ) diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py index 619a4a747..863c9d782 100644 --- a/mteb/models/openai_models.py +++ b/mteb/models/openai_models.py @@ -130,14 +130,12 @@ def _to_numpy(self, embedding_response) -> np.ndarray: embed_dim=1536, open_weights=False, n_parameters=None, - memory_usage=None, license=None, reference="https://openai.com/index/new-embedding-models-and-api-updates/", similarity_fn_name="cosine", framework=["API"], use_instructions=False, - public_training_data=False, # assumed - public_training_code=False, # assumed + public_training_code=None, # assumed training_datasets=None, ) text_embedding_3_large = ModelMeta( @@ -158,10 +156,10 @@ def _to_numpy(self, embedding_response) -> np.ndarray: framework=["API"], use_instructions=False, n_parameters=None, - memory_usage=None, - public_training_data=False, # assumed - public_training_code=False, # assumed + public_training_code=None, # assumed training_datasets=None, + license=None, + similarity_fn_name=None, ) text_embedding_ada_002 = ModelMeta( name="openai/text-embedding-ada-002", @@ -181,8 +179,8 @@ def _to_numpy(self, embedding_response) -> np.ndarray: framework=["API"], use_instructions=False, n_parameters=None, - memory_usage=None, - public_training_data=False, # assumed - public_training_code=False, # assumed + public_training_code=None, # assumed training_datasets=None, + license=None, + similarity_fn_name=None, ) diff --git a/mteb/models/overview.py b/mteb/models/overview.py index ea0fa1524..ad93efb31 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -212,22 +212,39 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: frameworks.append("Sentence Transformers") return ModelMeta( name=model_name, - revision=None, + revision=card_data.get("base_model_revision", None), # TODO release_date=None, # TODO: We need a mapping between conflicting language codes languages=None, license=card_data.get("license", None), framework=frameworks, - public_training_data=bool(card_data.get("datasets", None)), + training_datasets=card_data.get("datasets", None), + similarity_fn_name=None, + n_parameters=None, + max_tokens=None, + embed_dim=None, + open_weights=True, + public_training_code=None, + use_instructions=None, ) except Exception as e: logger.warning(f"Failed to extract metadata from model: {e}.") return ModelMeta( - name=None, + name=model_name, revision=None, languages=None, release_date=None, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + open_weights=True, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=[], ) @@ -250,6 +267,14 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe languages=languages, framework=["Sentence Transformers"], 
similarity_fn_name=model.similarity_fn_name, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + open_weights=True, + public_training_code=None, + use_instructions=None, + training_datasets=None, ) except AttributeError as e: logger.warning( @@ -260,5 +285,15 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe revision=None, languages=None, release_date=None, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + open_weights=True, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=[], ) return meta diff --git a/mteb/models/piccolo_models.py b/mteb/models/piccolo_models.py index 17ea1fc2a..bb92b5567 100644 --- a/mteb/models/piccolo_models.py +++ b/mteb/models/piccolo_models.py @@ -11,7 +11,6 @@ revision="47c0a63b8f667c3482e05b2fd45577bb19252196", release_date="2023-09-04", # first commit n_parameters=None, # can't see on model card - memory_usage=None, embed_dim=768, license="mit", max_tokens=512, @@ -21,8 +20,7 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, - public_training_data=False, + public_training_code=None, training_datasets=None, # They don't specify ) @@ -34,7 +32,6 @@ revision="05948c1d889355936bdf9db7d30df57dd78d25a3", release_date="2024-04-22", # first commit n_parameters=None, # we don't know because they removed the model - memory_usage=None, embed_dim=1024, license="not specified", max_tokens=512, @@ -44,7 +41,6 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, - public_training_data=False, + public_training_code=None, training_datasets=None, # They don't say ) diff --git a/mteb/models/promptriever_models.py b/mteb/models/promptriever_models.py index 7fc94cd36..a7066817a 100644 --- a/mteb/models/promptriever_models.py +++ b/mteb/models/promptriever_models.py @@ -56,7 +56,6 @@ def loader_inner(**kwargs: Any) -> Encoder: revision="01c7f73d771dfac7d292323805ebc428287df4f9-30b14e3813c0fa45facfd01a594580c3fe5ecf23", # base-peft revision release_date="2024-09-15", n_parameters=7_000_000, - memory_usage=None, max_tokens=4096, embed_dim=4096, license="apache-2.0", @@ -65,6 +64,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["PyTorch", "Tevatron"], use_instructions=True, + public_training_code=None, ) promptriever_llama3 = ModelMeta( @@ -82,7 +82,6 @@ def loader_inner(**kwargs: Any) -> Encoder: training_datasets={"samaya-ai/msmarco-w-instructions": ["train"]}, release_date="2024-09-15", n_parameters=8_000_000, - memory_usage=None, max_tokens=8192, embed_dim=4096, license="apache-2.0", @@ -90,6 +89,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["PyTorch", "Tevatron"], use_instructions=True, + public_training_code=None, ) @@ -107,7 +107,6 @@ def loader_inner(**kwargs: Any) -> Encoder: revision="5206a32e0bd3067aef1ce90f5528ade7d866253f-8b677258615625122c2eb7329292b8c402612c21", # base-peft revision release_date="2024-09-15", n_parameters=8_000_000, - memory_usage=None, max_tokens=8192, embed_dim=4096, training_datasets={"samaya-ai/msmarco-w-instructions": ["train"]}, @@ -116,6 +115,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["PyTorch", "Tevatron"], use_instructions=True, + public_training_code=None, ) promptriever_mistral_v1 = ModelMeta( @@ -133,7 +133,6 @@ def loader_inner(**kwargs: Any) -> Encoder: 
release_date="2024-09-15", n_parameters=7_000_000, training_datasets={"samaya-ai/msmarco-w-instructions": ["train"]}, - memory_usage=None, max_tokens=4096, embed_dim=4096, license="apache-2.0", @@ -141,4 +140,5 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["PyTorch", "Tevatron"], use_instructions=True, + public_training_code=None, ) diff --git a/mteb/models/repllama_models.py b/mteb/models/repllama_models.py index a1f1ba727..5ae4c0d8c 100644 --- a/mteb/models/repllama_models.py +++ b/mteb/models/repllama_models.py @@ -142,7 +142,6 @@ def loader_inner(**kwargs: Any) -> Encoder: release_date="2023-10-11", training_datasets={"Tevatron/msmarco-passage-aug": ["train"]}, n_parameters=7_000_000, - memory_usage=None, max_tokens=4096, embed_dim=4096, license="apache-2.0", @@ -150,6 +149,7 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["PyTorch", "Tevatron"], use_instructions=True, + public_training_code=None, ) @@ -168,7 +168,6 @@ def loader_inner(**kwargs: Any) -> Encoder: revision="01c7f73d771dfac7d292323805ebc428287df4f9-ad5c1d0938a1e02954bcafb4d811ba2f34052e71", # base-peft revision release_date="2024-09-15", n_parameters=7_000_000, - memory_usage=None, max_tokens=4096, embed_dim=4096, license="apache-2.0", @@ -176,4 +175,6 @@ def loader_inner(**kwargs: Any) -> Encoder: similarity_fn_name="cosine", framework=["PyTorch", "Tevatron"], use_instructions=True, + public_training_code=None, + training_datasets=None, ) diff --git a/mteb/models/rerankers_custom.py b/mteb/models/rerankers_custom.py index e8bb483a3..5609fdf83 100644 --- a/mteb/models/rerankers_custom.py +++ b/mteb/models/rerankers_custom.py @@ -204,6 +204,15 @@ def loader_inner(**kwargs: Any) -> Encoder: open_weights=True, revision="0a97706f3827389da43b83348d5d18c9d53876fa", release_date="2020-05-28", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["Sentence Transformers", "PyTorch"], ) # languages unclear: https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual/discussions/28 @@ -219,6 +228,15 @@ def loader_inner(**kwargs: Any) -> Encoder: open_weights=True, revision="126747772a932960028d9f4dc93bd5d9c4869be4", release_date="2024-09-26", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["Sentence Transformers", "PyTorch"], ) bge_reranker_v2_m3 = ModelMeta( @@ -266,4 +284,13 @@ def loader_inner(**kwargs: Any) -> Encoder: open_weights=True, revision="953dc6f6f85a1b2dbfca4c34a2796e7dde08d41e", release_date="2024-06-24", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["Sentence Transformers", "PyTorch"], ) diff --git a/mteb/models/rerankers_monot5_based.py b/mteb/models/rerankers_monot5_based.py index 6dfae3b0a..5bc50bad7 100644 --- a/mteb/models/rerankers_monot5_based.py +++ b/mteb/models/rerankers_monot5_based.py @@ -296,6 +296,15 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, revision="77f8e3f7b1eb1afe353aa21a7c3a2fc8feca702e", release_date="2022-03-28", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + 
use_instructions=None, + training_datasets=None, + framework=["PyTorch"], ) monot5_base = ModelMeta( @@ -310,6 +319,15 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, revision="f15657ab3d2a5dd0b9a30c8c0b6a0a73c9cb5884", release_date="2022-03-28", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["PyTorch"], ) monot5_large = ModelMeta( @@ -324,6 +342,15 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, revision="48cfad1d8dd587670393f27ee8ec41fde63e3d98", release_date="2022-03-28", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["PyTorch"], ) monot5_3b = ModelMeta( @@ -338,6 +365,15 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, revision="bc0c419a438c81f592f878ce32430a1823f5db6c", release_date="2022-03-28", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["PyTorch"], ) flant5_base = ModelMeta( @@ -364,6 +400,14 @@ def get_prediction_tokens(self, *args, **kwargs): "quasc": ["train"], "qed": ["train"], }, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + framework=["PyTorch"], ) flant5_large = ModelMeta( @@ -390,6 +434,14 @@ def get_prediction_tokens(self, *args, **kwargs): "quasc": ["train"], "qed": ["train"], }, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + framework=["PyTorch"], ) flant5_xl = ModelMeta( @@ -416,6 +468,14 @@ def get_prediction_tokens(self, *args, **kwargs): "quasc": ["train"], "qed": ["train"], }, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + framework=["PyTorch"], ) flant5_xxl = ModelMeta( @@ -442,6 +502,14 @@ def get_prediction_tokens(self, *args, **kwargs): "quasc": ["train"], "qed": ["train"], }, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + framework=["PyTorch"], ) @@ -457,6 +525,15 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, revision="01c7f73d771dfac7d292323805ebc428287df4f9", release_date="2023-07-18", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["PyTorch"], ) llama2_7b_chat = ModelMeta( @@ -471,6 +548,15 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, revision="f5db02db724555f92da89c216ac04704f23d4590", release_date="2023-07-18", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["PyTorch"], ) mistral_7b = ModelMeta( @@ -485,6 +571,15 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, revision="3ad372fc79158a2148299e3318516c786aeded6c", release_date="2023-12-11", + 
n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["PyTorch"], ) followir_7b = ModelMeta( @@ -500,6 +595,14 @@ def get_prediction_tokens(self, *args, **kwargs): revision="4d25d437e38b510c01852070c0731e8f6e1875d1", release_date="2024-04-29", training_datasets={"jhu-clsp/FollowIR-train": ["train"]}, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + framework=["PyTorch"], ) @@ -620,6 +723,14 @@ def get_prediction_tokens(self, *args, **kwargs): revision="cc0a949b9f21efcaba45c8cabb998ad02ce8d4e7", release_date="2022-01-05", training_datasets={"msmarco": ["train"]}, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + framework=["PyTorch"], ) mt5_13b_mmarco_100k = ModelMeta( @@ -634,4 +745,13 @@ def get_prediction_tokens(self, *args, **kwargs): open_weights=True, revision="e1a4317e102a525ea9e16745ad21394a4f1bffbc", release_date="2022-11-04", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["PyTorch"], ) diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index 6bca544b1..d8c7e8451 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -15,7 +15,6 @@ revision="dad72b8f77c5eef6995dd3e4691b758ba56b90c3", release_date="2021-10-28", n_parameters=29_400_000, - memory_usage=None, embed_dim=312, license="mit", max_tokens=2048, @@ -23,6 +22,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) rubert_tiny = ModelMeta( @@ -32,7 +33,6 @@ revision="5441c5ea8026d4f6d7505ec004845409f1259fb1", release_date="2021-05-24", n_parameters=29_400_000, - memory_usage=None, embed_dim=312, license="mit", max_tokens=2048, @@ -40,6 +40,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) sbert_large_nlu_ru = ModelMeta( @@ -49,7 +51,6 @@ revision="af977d5dfa46a3635e29bf0ef383f2df2a08d47a", release_date="2020-11-20", n_parameters=427_000_000, - memory_usage=None, embed_dim=1024, license="mit", max_tokens=512, # best guess @@ -57,6 +58,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) sbert_large_mt_nlu_ru = ModelMeta( @@ -66,7 +69,6 @@ revision="05300876c2b83f46d3ddd422a7f17e45cf633bb0", release_date="2021-05-18", n_parameters=427_000_000, - memory_usage=None, embed_dim=1024, license="Not specified", max_tokens=512, # best guess @@ -74,6 +76,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) user_base_ru = ModelMeta( @@ -89,7 +93,6 @@ revision="436a489a2087d61aa670b3496a9915f84e46c861", release_date="2024-06-10", n_parameters=427_000_000, - memory_usage=None, embed_dim=1024, license="Not specified", max_tokens=512, # best guess @@ -120,6 +123,7 @@ # "bragovo/dsum_ru": ["train"], # "CarlBrendt/Summ_Dialog_News": ["train"], }, + 
public_training_code=None, ) deberta_v1_ru = ModelMeta( @@ -129,7 +133,6 @@ revision="bdd30b0e19757e6940c92c7aff19e8fc0a60dff4", release_date="2023-02-07", n_parameters=124_000_000, - memory_usage=None, embed_dim=768, license="apache-2.0", max_tokens=512, @@ -137,6 +140,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) rubert_base_cased = ModelMeta( @@ -146,7 +151,6 @@ revision="4036cab694767a299f2b9e6492909664d9414229", release_date="2020-03-04", n_parameters=1280_000_000, - memory_usage=None, embed_dim=768, license="Not specified", max_tokens=512, # best guess @@ -154,6 +158,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) distilrubert_small_cased_conversational = ModelMeta( @@ -163,7 +169,6 @@ revision="e348066b4a7279b97138038299bddc6580a9169a", release_date="2022-06-28", n_parameters=107_000_000, - memory_usage=None, embed_dim=768, license="Not specified", max_tokens=512, @@ -171,6 +176,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) rubert_base_cased_sentence = ModelMeta( @@ -180,7 +187,6 @@ revision="78b5122d6365337dd4114281b0d08cd1edbb3bc8", release_date="2020-03-04", n_parameters=107_000_000, - memory_usage=None, embed_dim=768, license="Not specified", max_tokens=512, @@ -188,6 +194,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) labse_en_ru = ModelMeta( @@ -197,7 +205,6 @@ revision="cf0714e606d4af551e14ad69a7929cd6b0da7f7e", release_date="2021-06-10", n_parameters=129_000_000, - memory_usage=None, embed_dim=768, license="Not specified", max_tokens=512, @@ -205,6 +212,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, + training_datasets=None, ) rubert_tiny_turbo = ModelMeta( @@ -214,7 +223,6 @@ revision="8ce0cf757446ce9bb2d5f5a4ac8103c7a1049054", release_date="2024-06-21", n_parameters=129_000_000, - memory_usage=None, embed_dim=312, license="mit", max_tokens=512, @@ -222,6 +230,7 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + public_training_code=None, training_datasets=None, # source model in unknown # Not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, ) @@ -233,7 +242,6 @@ revision="1940b046c6b5e125df11722b899130329d0a46da", release_date="2024-06-27", n_parameters=129_000_000, - memory_usage=None, embed_dim=312, license="mit", max_tokens=512, @@ -243,6 +251,7 @@ use_instructions=False, training_datasets=None, # source model in unknown # not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, + public_training_code=None, ) @@ -264,4 +273,12 @@ revision="89fb1651989adbb1cfcfdedafd7d102951ad0555", release_date="2024-07-29", use_instructions=True, + n_parameters=404_000_000, + max_tokens=514, + embed_dim=1024, + license="mit", + similarity_fn_name="cosine", + public_training_code=None, + training_datasets=None, + framework=["Sentence Transformers", "PyTorch"], ) diff --git a/mteb/models/salesforce_models.py b/mteb/models/salesforce_models.py index 18db09a2b..4d4a60b62 100644 --- a/mteb/models/salesforce_models.py +++ b/mteb/models/salesforce_models.py 
@@ -32,7 +32,6 @@ def instruction_template( revision="91762139d94ed4371a9fa31db5551272e0b83818", release_date="2024-06-14", # initial commit of hf model. n_parameters=7_110_000_000, - memory_usage=None, embed_dim=4096, license="cc-by-nc-4.0", max_tokens=32768, @@ -41,8 +40,7 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, adapted_from="intfloat/e5-mistral-7b-instruct", - public_training_code=False, - public_training_data=False, + public_training_code=None, training_datasets={ # inherits from e5 "MSMARCO": ["train"], "MSMARCOHardNegatives": ["train"], @@ -73,7 +71,6 @@ def instruction_template( revision="938c560d1c236aa563b2dbdf084f28ab28bccb11", release_date="2024-01-24", # initial commit of hf model. n_parameters=7_110_000_000, - memory_usage=None, embed_dim=4096, license="cc-by-nc-4.0", max_tokens=32768, @@ -81,8 +78,7 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_code=False, - public_training_data=False, + public_training_code=None, training_datasets={ # inherits from e5 "MSMARCO": ["train"], "MSMARCOHardNegatives": ["train"], diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index f8b01c6ea..fa48ae7cc 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -101,7 +101,6 @@ revision="8b3219a92973c328a8e22fadcfa821b5dc75636a", release_date="2021-08-30", n_parameters=22_700_000, - memory_usage=None, embed_dim=384, license="apache-2.0", max_tokens=256, @@ -112,8 +111,7 @@ superseded_by=None, adapted_from=None, training_datasets=sent_trf_training_dataset, - public_training_code=True, - public_training_data=True, + public_training_code=None, ) all_MiniLM_L12_v2 = ModelMeta( @@ -123,7 +121,6 @@ revision="364dd28d28dcd3359b537f3cf1f5348ba679da62", release_date="2021-08-30", n_parameters=33_400_000, - memory_usage=None, embed_dim=384, license="apache-2.0", max_tokens=256, @@ -134,8 +131,7 @@ superseded_by=None, adapted_from=None, training_datasets=sent_trf_training_dataset, - public_training_code=True, - public_training_data=True, + public_training_code=None, ) paraphrase_multilingual_MiniLM_L12_v2 = ModelMeta( @@ -145,7 +141,6 @@ revision="bf3bf13ab40c3157080a7ab344c831b9ad18b5eb", release_date="2019-11-01", # release date of paper n_parameters=118_000_000, - memory_usage=None, embed_dim=768, license="apache-2.0", max_tokens=512, @@ -156,8 +151,7 @@ superseded_by=None, adapted_from=None, training_datasets=sent_trf_training_dataset, # assumed (probably some parallel as well) - public_training_code=True, - public_training_data=True, + public_training_code=None, ) paraphrase_multilingual_mpnet_base_v2 = ModelMeta( @@ -167,7 +161,6 @@ revision="79f2382ceacceacdf38563d7c5d16b9ff8d725d6", release_date="2019-11-01", # release date of paper n_parameters=278_000_000, - memory_usage=None, embed_dim=768, license="apache-2.0", max_tokens=512, @@ -189,8 +182,7 @@ # "flickr30k-captions": flickr_train_dataset, # "yahoo-answers": yahoo_answers_train_dataset, # "stack-exchange": stack_exchange_train_dataset, - public_training_code=True, - public_training_data=True, + public_training_code=None, ) labse = ModelMeta( @@ -200,7 +192,6 @@ revision="e34fab64a3011d2176c99545a93d5cbddc9a91b7", release_date="2019-11-01", # release date of paper n_parameters=471_000_000, - memory_usage=None, embed_dim=768, license="apache-2.0", max_tokens=512, @@ -211,8 +202,7 @@ 
superseded_by=None, adapted_from=None, training_datasets=None, # scraped and mined webdata including CC, wiki, see section 3.1 https://aclanthology.org/2022.acl-long.62.pdf - public_training_code=True, # https://www.kaggle.com/models/google/labse/tensorFlow2/labse/2?tfhub-redirect=true - public_training_data=False, + public_training_code="https://www.kaggle.com/models/google/labse/tensorFlow2/labse/2?tfhub-redirect=true", ) multi_qa_MiniLM_L6_cos_v1 = ModelMeta( @@ -222,7 +212,6 @@ revision="b207367332321f8e44f96e224ef15bc607f4dbf0", release_date="2021-08-30", n_parameters=22_700_000, - memory_usage=None, embed_dim=384, license="apache-2.0", max_tokens=512, @@ -234,7 +223,6 @@ adapted_from="nreimers/MiniLM-L6-H384-uncased", training_datasets=sent_trf_training_dataset, # assumed public_training_code=None, - public_training_data=None, ) all_mpnet_base_v2 = ModelMeta( @@ -244,7 +232,6 @@ revision="9a3225965996d404b775526de6dbfe85d3368642", release_date="2021-08-30", n_parameters=109_000_000, - memory_usage=None, embed_dim=768, license="apache-2.0", max_tokens=384, @@ -255,8 +242,7 @@ superseded_by=None, adapted_from=None, training_datasets=sent_trf_training_dataset, - public_training_code=True, - public_training_data=True, + public_training_code=None, ) @@ -267,7 +253,6 @@ revision="98f70f14cdf12d7ea217ed2fd4e808b0195f1e7e", release_date="2024-11-10", n_parameters=272_000_000, - memory_usage=None, embed_dim=1024, license="apache-2.0", max_tokens=2048, @@ -288,4 +273,5 @@ # "sentence-transformers/quora-duplicates": ["train"], # "sentence-transformers/natural-questions": ["train"], }, + public_training_code=None, ) diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index 1e04b4116..44aa1f860 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -29,8 +29,7 @@ framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_400M_v5", training_datasets=None, - public_training_data=False, # currently not released - public_training_code=False, + public_training_code=None, ) stella_en_1_5b = ModelMeta( @@ -56,8 +55,7 @@ framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_1.5B_v5", training_datasets=None, - public_training_data=False, # currently not released - public_training_code=False, + public_training_code=None, ) stella_large_zh_v3_1792d = ModelMeta( @@ -67,7 +65,6 @@ revision="d5d39eb8cd11c80a63df53314e59997074469f09", release_date="2024-02-17", n_parameters=None, # can't see on model card - memory_usage=None, embed_dim=1792, license="not specified", max_tokens=512, @@ -77,8 +74,7 @@ use_instructions=False, superseded_by="dunzhang/stella-mrl-large-zh-v3.5-1792d", adapted_from=None, - public_training_code=False, - public_training_data=True, + public_training_code=None, training_datasets={ # Not in MTEB: # - infgrad/dialogue_rewrite_llm @@ -93,7 +89,6 @@ revision="82254892a0fba125aa2abf3a4800d2dd12821343", release_date="2024-02-17", n_parameters=None, # can't see on model card - memory_usage=None, embed_dim=1792, license="mit", max_tokens=512, @@ -103,8 +98,7 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, - public_training_data=True, + public_training_code=None, training_datasets={ # Not in MTEB: # - infgrad/dialogue_rewrite_llm @@ -120,7 +114,6 @@ revision="17bb1c32a93a8fc5f6fc9e91d5ea86da99983cfe", release_date="2024-02-27", n_parameters=326 * 1e6, - memory_usage=None, embed_dim=1792, 
license="mit", max_tokens=512, @@ -130,8 +123,7 @@ use_instructions=False, superseded_by=None, adapted_from="dunzhang/stella-large-zh-v3-1792d", - public_training_code=False, - public_training_data=True, + public_training_code=None, training_datasets=None, # Not specified ) @@ -142,7 +134,6 @@ revision="b1075144f440ab4409c05622c1179130ebd57d03", release_date="2024-06-04", n_parameters=326 * 1e6, - memory_usage=None, embed_dim=1792, license="mit", max_tokens=512, @@ -152,8 +143,7 @@ use_instructions=False, superseded_by=None, adapted_from="dunzhang/stella-mrl-large-zh-v3.5-1792d", - public_training_code=False, - public_training_data=True, + public_training_code=None, training_datasets={ # It's a bit unclear what they have trained on to be honest, because they don't list all # And they also have some rather cryptic description of their training procedure, but at diff --git a/mteb/models/text2vec_models.py b/mteb/models/text2vec_models.py index e26108e0a..12322e69e 100644 --- a/mteb/models/text2vec_models.py +++ b/mteb/models/text2vec_models.py @@ -12,7 +12,6 @@ revision="183bb99aa7af74355fb58d16edf8c13ae7c5433e", release_date="2022-01-23", n_parameters=102 * 1e6, - memory_usage=None, embed_dim=768, license="apache-2.0", max_tokens=512, @@ -22,8 +21,7 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, # Couldn't find it - public_training_data=True, + public_training_code=None, # Couldn't find it training_datasets={ # source: https://huggingface.co/shibing624/text2vec-base-chinese # Not in MTEB @@ -39,7 +37,6 @@ revision="e90c150a9c7fb55a67712a766d6820c55fb83cdd", release_date="2023-06-19", n_parameters=118 * 1e6, - memory_usage=None, embed_dim=768, license="apache-2.0", max_tokens=512, @@ -49,8 +46,7 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, # Couldn't find it - public_training_data=True, + public_training_code=None, # Couldn't find it training_datasets={ # source: https://huggingface.co/shibing624/text2vec-base-chinese # Not in MTEB @@ -82,7 +78,6 @@ # So probably best not to. 
loader=None, n_parameters=118 * 1e6, - memory_usage=None, embed_dim=384, license="apache-2.0", max_tokens=256, @@ -92,8 +87,7 @@ use_instructions=False, superseded_by=None, adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", - public_training_code=False, # Couldn't find it - public_training_data=True, + public_training_code=None, # Couldn't find it training_datasets={ # source: https://huggingface.co/shibing624/text2vec-base-chinese # Not in MTEB diff --git a/mteb/models/uae_models.py b/mteb/models/uae_models.py index ffdaa29f7..bd8be4869 100644 --- a/mteb/models/uae_models.py +++ b/mteb/models/uae_models.py @@ -82,6 +82,5 @@ def encode( "NLI": [], "SNLI": [], }, - public_training_data=True, - public_training_code=True, + public_training_code=None, ) diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index 12925b235..a98bc041b 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -151,15 +151,13 @@ def _batched_encode( embed_dim=1024, open_weights=False, n_parameters=None, - memory_usage=None, license=None, reference="https://blog.voyageai.com/2024/05/05/voyage-large-2-instruct-instruction-tuned-and-rank-1-on-mteb/", similarity_fn_name="cosine", framework=["API"], use_instructions=True, training_datasets=None, - public_training_data=False, # couldn't find - public_training_code=False, + public_training_code=None, ) voyage_finance_2 = ModelMeta( @@ -176,15 +174,13 @@ def _batched_encode( embed_dim=1024, open_weights=False, n_parameters=None, - memory_usage=None, license=None, reference="https://blog.voyageai.com/2024/06/03/domain-specific-embeddings-finance-edition-voyage-finance-2/", similarity_fn_name="cosine", framework=["API"], use_instructions=True, training_datasets=None, - public_training_data=False, # couldn't find - public_training_code=False, + public_training_code=None, ) voyage_law_2 = ModelMeta( @@ -201,15 +197,13 @@ def _batched_encode( embed_dim=1024, open_weights=False, n_parameters=None, - memory_usage=None, license=None, reference="https://blog.voyageai.com/2024/04/15/domain-specific-embeddings-and-retrieval-legal-edition-voyage-law-2/", similarity_fn_name="cosine", framework=["API"], use_instructions=True, training_datasets=None, - public_training_data=False, # couldn't find - public_training_code=False, + public_training_code=None, ) voyage_code_2 = ModelMeta( @@ -226,15 +220,13 @@ def _batched_encode( embed_dim=1536, open_weights=False, n_parameters=None, - memory_usage=None, license=None, reference="https://blog.voyageai.com/2024/01/23/voyage-code-2-elevate-your-code-retrieval/", similarity_fn_name="cosine", framework=["API"], use_instructions=True, training_datasets=None, - public_training_data=False, # couldn't find - public_training_code=False, + public_training_code=None, ) voyage_large_2 = ModelMeta( @@ -251,15 +243,13 @@ def _batched_encode( embed_dim=1536, open_weights=False, n_parameters=None, - memory_usage=None, license=None, reference="https://blog.voyageai.com/2023/10/29/voyage-embeddings/", similarity_fn_name="cosine", framework=["API"], use_instructions=True, training_datasets=None, - public_training_data=False, # couldn't find - public_training_code=False, + public_training_code=None, ) voyage_2 = ModelMeta( @@ -276,15 +266,13 @@ def _batched_encode( embed_dim=1024, open_weights=False, n_parameters=None, - memory_usage=None, license=None, reference="https://blog.voyageai.com/2023/10/29/voyage-embeddings/", similarity_fn_name="cosine", framework=["API"], use_instructions=True, 
training_datasets=None, - public_training_data=False, - public_training_code=False, + public_training_code=None, ) voyage_multilingual_2 = ModelMeta( name="voyageai/voyage-multilingual-2", @@ -300,15 +288,13 @@ def _batched_encode( embed_dim=1024, open_weights=False, n_parameters=None, - memory_usage=None, license=None, reference="https://blog.voyageai.com/2024/06/10/voyage-multilingual-2-multilingual-embedding-model/", similarity_fn_name="cosine", framework=["API"], use_instructions=True, training_datasets=None, - public_training_data=False, # couldn't find - public_training_code=False, + public_training_code=None, ) voyage_3 = ModelMeta( @@ -325,15 +311,13 @@ def _batched_encode( embed_dim=1024, open_weights=False, n_parameters=None, - memory_usage=None, license=None, reference="https://blog.voyageai.com/2024/09/18/voyage-3/", similarity_fn_name="cosine", framework=["API"], use_instructions=True, training_datasets=None, - public_training_data=False, # couldn't find - public_training_code=False, + public_training_code=None, ) voyage_3_lite = ModelMeta( @@ -350,13 +334,11 @@ def _batched_encode( embed_dim=512, open_weights=False, n_parameters=None, - memory_usage=None, license=None, reference="https://blog.voyageai.com/2024/09/18/voyage-3/", similarity_fn_name="cosine", framework=["API"], use_instructions=True, training_datasets=None, - public_training_data=False, # couldn't find - public_training_code=False, + public_training_code=None, ) diff --git a/scripts/generate_metadata.py b/scripts/generate_metadata.py index a96604446..a192fa134 100644 --- a/scripts/generate_metadata.py +++ b/scripts/generate_metadata.py @@ -220,7 +220,6 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: license=card_data.get("license", None), framework=frameworks, n_parameters=n_parameters, - public_training_data=bool(datasets), adapted_from=get_base_model(model_name), training_datasets=training_datasets, open_weights=True, @@ -237,6 +236,16 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: revision=None, languages=None, release_date=None, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + open_weights=True, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + frameworks=[], ) diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py index c540bb41e..fb0cf6cf5 100644 --- a/tests/test_tasks/test_mteb_rerank.py +++ b/tests/test_tasks/test_mteb_rerank.py @@ -6,8 +6,7 @@ from sentence_transformers import CrossEncoder, SentenceTransformer -from mteb import MTEB -from mteb.model_meta import ModelMeta +from mteb import MTEB, ModelMeta logging.basicConfig(level=logging.INFO) @@ -373,7 +372,18 @@ def test_reranker_same_ndcg1(): open_weights=True, revision=ce_revision, release_date="2021-04-15", + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + public_training_code=None, + reference=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=["Sentence Transformers", "PyTorch"], ) + eval = MTEB(tasks=["SciFact"]) eval.run( de, From 46f6abc795a7443fc95519205c44c088e634b6fe Mon Sep 17 00:00:00 2001 From: github-actions Date: Mon, 20 Jan 2025 06:08:50 +0000 Subject: [PATCH 32/49] 1.29.10 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6ec3f11e2..1404925f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 
+4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.9" +version = "1.29.10" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 77f7c839e5ae6cc92c643719f3eb75ded27f9649 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Mon, 20 Jan 2025 16:07:11 +0300 Subject: [PATCH 33/49] fix merge --- mteb/models/arctic_models.py | 25 +++++ mteb/models/bge_models.py | 1 - mteb/models/colbert_models.py | 4 +- mteb/models/e5_models.py | 2 +- mteb/models/salesforce_models.py | 27 ----- mteb/models/sentence_transformers_models.py | 111 +------------------- 6 files changed, 31 insertions(+), 139 deletions(-) diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py index 1d99e31f4..b7217d1ef 100644 --- a/mteb/models/arctic_models.py +++ b/mteb/models/arctic_models.py @@ -115,6 +115,31 @@ primaryClass={cs.LG}, url={https://arxiv.org/abs/2407.18887}, }""", + public_training_code=None, + training_datasets={ + # source: https://arxiv.org/pdf/2405.05374 + # splits not specified to assuming everything + # in MTEB + "NQ": ["test"], + "NQHardNegatives": ["test"], + "HotPotQA": ["test"], + "HotPotQAHardNegatives": ["test"], + "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) + "FEVER": ["test"], + "FEVERHardNegatives": ["test"], + # not in MTEB + # trained on stack exchange (title-body) + # "stackexchange": [], + # potentially means that: + # "StackExchangeClusteringP2P": ["test"], + # "StackExchangeClusteringP2P.v2": ["test"], + # "StackExchangeClustering": ["test"], + # "StackExchangeClustering.v2": ["test"], + # not in MTEB + # "paq": [], + # "s2orc": [], + # "other": [], # undisclosed including webdata + }, # also use synthetic ) diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index 795854806..79d220588 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -431,7 +431,6 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, citation=BGE_15_CITATION, - public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP public_training_code=None, # seemingly released (at least for some models, but the link is broken training_datasets=bge_training_data, ) diff --git a/mteb/models/colbert_models.py b/mteb/models/colbert_models.py index 0a8c0e4a5..f4baca358 100644 --- a/mteb/models/colbert_models.py +++ b/mteb/models/colbert_models.py @@ -161,7 +161,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: max_tokens=180, # Reduced for Benchmarking - see ColBERT paper embed_dim=None, # Bag of Embeddings (128) for each token license="mit", - similarity_fn_name="MaxSim", + similarity_fn_name="max_sim", framework=["PyLate", "ColBERT"], reference="https://huggingface.co/colbert-ir/colbertv2.0", use_instructions=False, @@ -213,7 +213,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: max_tokens=8192, embed_dim=None, # Bag of Embeddings (128) for each token license="cc-by-nc-4.0", - similarity_fn_name="MaxSim", + similarity_fn_name="max_sim", framework=["PyLate", "ColBERT"], reference="https://huggingface.co/jinaai/jina-colbert-v2", use_instructions=False, diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py index 651ca8152..fe265f6f4 100644 --- a/mteb/models/e5_models.py +++ b/mteb/models/e5_models.py @@ -165,7 +165,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, citation=MULTILINGUAL_E5_CITATION, - public_training_code=False, # couldn't find + 
public_training_code=None, # couldn't find training_datasets=E5_TRAINING_DATA, ) diff --git a/mteb/models/salesforce_models.py b/mteb/models/salesforce_models.py index a8968648f..fd5487166 100644 --- a/mteb/models/salesforce_models.py +++ b/mteb/models/salesforce_models.py @@ -61,33 +61,6 @@ def instruction_template( ) -SFR_Embedding_Mistral = ModelMeta( - loader=partial( # type: ignore - instruct_wrapper, - model_name_or_path="Salesforce/SFR-Embedding-Mistral", - instruction_template=instruction_template, - attn="cccc", - pooling_method="lasttoken", - mode="embedding", - torch_dtype="auto", - normalized=True, - ), - name="Salesforce/SFR-Embedding-Mistral", - languages=["eng_Latn"], - open_weights=True, - revision="938c560d1c236aa563b2dbdf084f28ab28bccb11", - release_date="2024-01-24", # initial commit of hf model. - n_parameters=7_110_000_000, - embed_dim=4096, - license="cc-by-nc-4.0", - max_tokens=32768, - reference="https://huggingface.co/Salesforce/SFR-Embedding-Mistral", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=True, -) - - SFR_Embedding_Mistral = ModelMeta( loader=partial( # type: ignore instruct_wrapper, diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index 4f500fd51..63be6e925 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -272,8 +272,7 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=False, # does sentence transformer count? - public_training_data=True, + public_training_code=None, # does sentence transformer count? training_datasets={ # source: frontmatter in readme # trained on stack exchange, unsure if sources match @@ -309,112 +308,6 @@ citation=SBERT_CITATION, ) -jina_embeddings_v2_base_en = ModelMeta( - name="jinaai/jina-embeddings-v2-base-en", - languages=["eng-Latn"], - open_weights=True, - revision="6e85f575bc273f1fd840a658067d0157933c83f0", - release_date="2023-09-27", - n_parameters=137_000_000, - embed_dim=768, - license="apache-2.0", - max_tokens=8192, - reference="https://huggingface.co/jinaai/jina-embeddings-v2-base-en", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, - training_datasets={"allenai/c4": ["train"]}, -) - -jina_embeddings_v2_small_en = ModelMeta( - name="jinaai/jina-embeddings-v2-small-en", - languages=["eng-Latn"], - open_weights=True, - revision="796cff318cdd4e5fbe8b7303a1ef8cbec36996ef", - release_date="2023-09-27", - n_parameters=32_700_000, - embed_dim=512, - license="apache-2.0", - max_tokens=8192, - reference="https://huggingface.co/jinaai/jina-embeddings-v2-small-en", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, - training_datasets={"jinaai/negation-dataset": ["train"]}, -) - -jina_embedding_b_en_v1 = ModelMeta( - name="jinaai/jina-embedding-b-en-v1", - languages=["eng-Latn"], - open_weights=True, - revision="aa0645035294a8c0607ce5bb700aba982cdff32c", - release_date="2023-07-07", - n_parameters=110_000_000, - embed_dim=768, - license="apache-2.0", - max_tokens=512, - reference="https://huggingface.co/jinaai/jina-embedding-b-en-v1", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by="jinaai/jina-embeddings-v2-base-en", - adapted_from=None, - 
training_datasets={"jinaai/negation-dataset": ["train"]}, -) - -jina_embedding_s_en_v1 = ModelMeta( - name="jinaai/jina-embedding-s-en-v1", - languages=["eng-Latn"], - open_weights=True, - revision="c1fed70aa4823a640f1a7150a276e4d3b08dce08", - release_date="2023-07-07", - n_parameters=35_000_000, - embed_dim=512, - license="apache-2.0", - max_tokens=512, - reference="https://huggingface.co/jinaai/jina-embedding-s-en-v1", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by="jinaai/jina-embeddings-v2-small-en", - adapted_from=None, - training_datasets={"jinaai/negation-dataset": ["train"]}, -) - -all_MiniLM_L12_v2 = ModelMeta( - name="sentence-transformers/all-MiniLM-L12-v2", - languages=["eng-Latn"], - open_weights=True, - revision="364dd28d28dcd3359b537f3cf1f5348ba679da62", - release_date="2021-08-30", - n_parameters=33_400_000, - embed_dim=384, - license="apache-2.0", - max_tokens=256, - reference="https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2", - similarity_fn_name="cosine", - framework=["Sentence Transformers", "PyTorch"], - use_instructions=False, - superseded_by=None, - adapted_from=None, - citation="""@misc{feng2022languageagnosticbertsentenceembedding, - title={Language-agnostic BERT Sentence Embedding}, - author={Fangxiaoyu Feng and Yinfei Yang and Daniel Cer and Naveen Arivazhagan and Wei Wang}, - year={2022}, - eprint={2007.01852}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2007.01852}, - }""", - training_datasets=sent_trf_training_dataset, - public_training_code=None, -) - - contriever = ModelMeta( loader=partial( SentenceTransformerWrapper, @@ -443,6 +336,8 @@ url = {https://arxiv.org/abs/2112.09118}, doi = {10.48550/ARXIV.2112.09118}, }""", + public_training_code=None, + training_datasets=None, ) microllama_text_embedding = ModelMeta( From a7a8144a6964641614c7d407e43c75ab5b7c40ca Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Tue, 21 Jan 2025 11:38:42 +0100 Subject: [PATCH 34/49] fix: Add reported annotation and re-added public_training_data (#1846) * fix: Add additional dataset annotations * fix: readded public training data * update voyage annotations --- mteb/model_meta.py | 4 +- mteb/models/arctic_models.py | 22 +++-- mteb/models/bge_models.py | 16 ++- mteb/models/bm25.py | 1 + mteb/models/cohere_models.py | 12 ++- mteb/models/colbert_models.py | 2 + mteb/models/e5_instruct.py | 12 +++ mteb/models/e5_models.py | 25 ++++- mteb/models/google_models.py | 9 +- mteb/models/gritlm_models.py | 18 +++- mteb/models/gte_models.py | 9 +- mteb/models/ibm_granite_models.py | 4 + mteb/models/inf_models.py | 1 + mteb/models/jasper_models.py | 1 + mteb/models/jina_models.py | 5 + mteb/models/linq_models.py | 1 + mteb/models/llm2vec_models.py | 8 ++ mteb/models/misc_models.py | 69 +++++++++++++ mteb/models/model2vec_models.py | 9 +- mteb/models/moka_models.py | 9 +- mteb/models/mxbai_models.py | 1 + mteb/models/no_instruct_sentence_models.py | 1 + mteb/models/nomic_models.py | 5 + mteb/models/nvidia_models.py | 2 + mteb/models/openai_models.py | 9 +- mteb/models/overview.py | 4 + mteb/models/piccolo_models.py | 2 + mteb/models/promptriever_models.py | 4 + mteb/models/repllama_models.py | 2 + mteb/models/rerankers_custom.py | 3 + mteb/models/rerankers_monot5_based.py | 14 +++ mteb/models/ru_sentence_models.py | 13 +++ mteb/models/salesforce_models.py | 39 ++++---- mteb/models/sentence_transformers_models.py | 8 ++ mteb/models/stella_models.py | 6 ++ 
mteb/models/text2vec_models.py | 9 +- mteb/models/uae_models.py | 1 + mteb/models/voyage_models.py | 104 ++++++++++++++++++-- scripts/generate_metadata.py | 1 + tests/test_tasks/test_mteb_rerank.py | 1 + 40 files changed, 402 insertions(+), 64 deletions(-) diff --git a/mteb/model_meta.py b/mteb/model_meta.py index b105f301b..c88326edc 100644 --- a/mteb/model_meta.py +++ b/mteb/model_meta.py @@ -66,7 +66,8 @@ class ModelMeta(BaseModel): release_date: The date the model's revision was released. license: The license under which the model is released. Required if open_weights is True. open_weights: Whether the model is open source or proprietary. - public_training_code: Whether the code used to train the model is publicly available. + public_training_code: A link to the publicly available training code. If none it is assumed that the training code is not publicly available. + public_training_data: A link to the publicly available training data. If none it is assumed that the training data is not publicly available. similarity_fn_name: The distance metric used by the model. framework: The framework the model is implemented in, can be a list of frameworks e.g. `["Sentence Transformers", "PyTorch"]`. reference: A URL to the model's page on huggingface or another source. @@ -94,6 +95,7 @@ class ModelMeta(BaseModel): license: str | None open_weights: bool | None public_training_code: str | None + public_training_data: str | None framework: list[FRAMEWORKS] reference: STR_URL | None = None similarity_fn_name: DISTANCE_METRICS | None diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py index 66822d41b..f765b01bf 100644 --- a/mteb/models/arctic_models.py +++ b/mteb/models/arctic_models.py @@ -102,7 +102,8 @@ use_instructions=True, adapted_from="sentence-transformers/all-MiniLM-L6-v2", superseded_by=None, - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -151,7 +152,8 @@ use_instructions=True, adapted_from="intfloat/e5-small-unsupervised", superseded_by=None, - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -200,7 +202,8 @@ use_instructions=True, adapted_from="intfloat/e5-base-unsupervised", superseded_by="Snowflake/snowflake-arctic-embed-m-v1.5", - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -249,7 +252,8 @@ use_instructions=True, adapted_from="nomic-ai/nomic-embed-text-v1-unsupervised", superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0", - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -298,7 +302,8 @@ use_instructions=True, adapted_from="intfloat/e5-base-unsupervised", superseded_by="Snowflake/snowflake-arctic-embed-l-v2.0", - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -350,6 +355,7 @@ 
adapted_from=None, superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0", public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -375,7 +381,8 @@ use_instructions=True, adapted_from="Alibaba-NLP/gte-multilingual-base", superseded_by=None, - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -423,7 +430,8 @@ use_instructions=True, adapted_from="BAAI/bge-m3-retromae", superseded_by=None, - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index d8270c573..d9eb64246 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -372,7 +372,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_code=None, + public_training_data="https://data.baai.ac.cn/details/BAAI-MTP", training_datasets=bge_training_data, ) @@ -397,6 +398,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_data="https://data.baai.ac.cn/details/BAAI-MTP", training_datasets=bge_training_data, ) @@ -421,6 +423,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_data="https://data.baai.ac.cn/details/BAAI-MTP", training_datasets=bge_training_data, ) @@ -444,7 +447,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_code=None, + public_training_data=None, training_datasets=bge_chinese_training_data, ) @@ -468,7 +472,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_code=None, + public_training_data=None, training_datasets=bge_chinese_training_data, ) @@ -492,7 +497,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_code=None, + public_training_data=None, training_datasets=bge_chinese_training_data, ) @@ -516,6 +522,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=bgem3_training_data, ) @@ -549,5 +556,6 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, # not disclosed ) diff --git a/mteb/models/bm25.py b/mteb/models/bm25.py index ea56fd432..6e3d3747d 100644 --- a/mteb/models/bm25.py +++ b/mteb/models/bm25.py @@ -139,5 +139,6 @@ def encode(self, texts: list[str], **kwargs): framework=[], use_instructions=False, 
public_training_code="https://github.com/xhluca/bm25s", + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/cohere_models.py b/mteb/models/cohere_models.py index 8718a2e2a..60ff63ee8 100644 --- a/mteb/models/cohere_models.py +++ b/mteb/models/cohere_models.py @@ -234,7 +234,8 @@ def encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -257,7 +258,8 @@ def encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -280,7 +282,8 @@ def encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -303,6 +306,7 @@ def encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) diff --git a/mteb/models/colbert_models.py b/mteb/models/colbert_models.py index 87b5fdb93..89b09de28 100644 --- a/mteb/models/colbert_models.py +++ b/mteb/models/colbert_models.py @@ -153,6 +153,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: open_weights=True, revision="c1e84128e85ef755c096a95bdb06b47793b13acf", public_training_code=None, + public_training_data=None, release_date="2024-09-21", n_parameters=110 * 1e6, max_tokens=180, # Reduced for Benchmarking - see ColBERT paper @@ -205,6 +206,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: open_weights=True, revision="4cf816e5e2b03167b132a3c847a9ecd48ba708e1", public_training_code=None, + public_training_data=None, release_date="2024-08-16", n_parameters=559 * 1e6, max_tokens=8192, diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py index f4d590935..c89b64fc7 100644 --- a/mteb/models/e5_instruct.py +++ b/mteb/models/e5_instruct.py @@ -15,6 +15,16 @@ E5_INSTRUCTION = "Instruct: {instruction}\nQuery: " +E5_MISTRAL_TRAINING_DATA = { + **E5_TRAINING_DATA, + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "FEVER-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQAHardNegatives": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on +} + e5_instruct = ModelMeta( loader=partial( # type: ignore instruct_wrapper, @@ -40,6 +50,7 @@ license="mit", max_tokens=514, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) @@ -70,5 +81,6 @@ license="mit", max_tokens=32768, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py index ace25ca08..0ad15e732 100644 --- a/mteb/models/e5_models.py +++ b/mteb/models/e5_models.py @@ -126,6 +126,16 @@ "NQ-PL": ["train"], # translation not trained on } +ME5_TRAINING_DATA = { + **E5_TRAINING_DATA, + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "FEVER-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQAHardNegatives": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on +} + e5_mult_small = ModelMeta( loader=partial( # type: ignore sentence_transformers_loader, @@ -147,7 +157,8 @@ 
framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, - training_datasets=E5_TRAINING_DATA, + public_training_data=None, + training_datasets=ME5_TRAINING_DATA, ) e5_mult_base = ModelMeta( @@ -170,7 +181,8 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, - training_datasets=E5_TRAINING_DATA, + public_training_data=None, + training_datasets=ME5_TRAINING_DATA, ) e5_mult_large = ModelMeta( @@ -194,7 +206,8 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, - training_datasets=E5_TRAINING_DATA, + public_training_data=None, + training_datasets=ME5_TRAINING_DATA, ) e5_eng_small_v2 = ModelMeta( @@ -217,6 +230,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) @@ -241,6 +255,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) @@ -267,6 +282,7 @@ superseded_by=None, adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) @@ -293,6 +309,7 @@ superseded_by=None, adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) @@ -319,6 +336,7 @@ superseded_by="intfloat/e5-large-v2", adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) @@ -345,5 +363,6 @@ superseded_by="intfloat/e5-base-v2", adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) diff --git a/mteb/models/google_models.py b/mteb/models/google_models.py index 08065f7af..40d316fee 100644 --- a/mteb/models/google_models.py +++ b/mteb/models/google_models.py @@ -151,7 +151,8 @@ def encode( similarity_fn_name="cosine", # assumed framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -173,7 +174,8 @@ def encode( similarity_fn_name="cosine", # assumed framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -195,6 +197,7 @@ def encode( similarity_fn_name="cosine", # assumed framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) diff --git a/mteb/models/gritlm_models.py b/mteb/models/gritlm_models.py index a68502b06..d15c1f4a5 100644 --- a/mteb/models/gritlm_models.py +++ b/mteb/models/gritlm_models.py @@ -11,6 +11,18 @@ logger = logging.getLogger(__name__) +GRIT_LM_TRAINING_DATA = { + **E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 + # also uses medi2 which contains fever and hotpotqa: + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "FEVER-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQAHardNegatives": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on +} + + def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: return ( "<|user|>\n" + instruction + "\n<|embed|>\n" if instruction else "<|embed|>\n" @@ -38,9 +50,10 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: 
similarity_fn_name="cosine", framework=["GritLM", "PyTorch"], use_instructions=True, - training_datasets=E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 + training_datasets=GRIT_LM_TRAINING_DATA, # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data public_training_code="https://github.com/ContextualAI/gritlm", + public_training_data=None, ) gritlm8x7b = ModelMeta( loader=partial( # type: ignore @@ -63,7 +76,8 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: similarity_fn_name="cosine", framework=["GritLM", "PyTorch"], use_instructions=True, - training_datasets=E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 + training_datasets=GRIT_LM_TRAINING_DATA, # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data public_training_code="https://github.com/ContextualAI/gritlm", + public_training_data=None, ) diff --git a/mteb/models/gte_models.py b/mteb/models/gte_models.py index da265e79c..4de4b610f 100644 --- a/mteb/models/gte_models.py +++ b/mteb/models/gte_models.py @@ -46,6 +46,7 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, + public_training_data=None, training_datasets=None, max_tokens=131072, ) @@ -77,6 +78,7 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -107,6 +109,7 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -130,6 +133,7 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, # Not disclosed ) @@ -153,6 +157,7 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, # Not disclosed ) @@ -176,6 +181,7 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, # Not disclosed ) @@ -291,6 +297,7 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets=gte_multi_training_data, ) diff --git a/mteb/models/ibm_granite_models.py b/mteb/models/ibm_granite_models.py index 78bad6097..63679879c 100644 --- a/mteb/models/ibm_granite_models.py +++ b/mteb/models/ibm_granite_models.py @@ -42,6 +42,7 @@ adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, use_instructions=False, training_datasets=None, ) @@ -67,6 +68,7 @@ adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, use_instructions=False, training_datasets=None, ) @@ -92,6 +94,7 @@ adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, use_instructions=False, training_datasets=None, ) @@ -117,6 +120,7 @@ adapted_from=None, superseded_by=None, public_training_code=None, + 
public_training_data=None, use_instructions=False, training_datasets=None, ) diff --git a/mteb/models/inf_models.py b/mteb/models/inf_models.py index dc31adccd..0d40ff3ef 100644 --- a/mteb/models/inf_models.py +++ b/mteb/models/inf_models.py @@ -26,5 +26,6 @@ use_instructions=True, adapted_from="Alibaba-NLP/gte-Qwen2-7B-instruct", public_training_code=None, + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/jasper_models.py b/mteb/models/jasper_models.py index 1dc06d564..dbd1615ad 100644 --- a/mteb/models/jasper_models.py +++ b/mteb/models/jasper_models.py @@ -93,4 +93,5 @@ def encode( training_datasets=nvidia_training_datasets, # "In jasper model the teacher model is nvidia/NV-Embed-v2", source https://huggingface.co/infgrad/jasper_en_vision_language_v1 # "non_mteb": ["BAAI/Infinity-MM", "HuggingFaceFW/fineweb-edu"], public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py index 265d51237..4f1b58a35 100644 --- a/mteb/models/jina_models.py +++ b/mteb/models/jina_models.py @@ -224,6 +224,7 @@ def encode( reference="https://huggingface.co/jinaai/jina-embeddings-v3", training_datasets=None, public_training_code=None, + public_training_data=None, ) @@ -245,6 +246,7 @@ def encode( adapted_from=None, training_datasets=None, public_training_code=None, + public_training_data=None, ) jina_embeddings_v2_small_en = ModelMeta( @@ -265,6 +267,7 @@ def encode( adapted_from=None, training_datasets=None, public_training_code=None, + public_training_data=None, ) jina_embedding_b_en_v1 = ModelMeta( @@ -285,6 +288,7 @@ def encode( adapted_from=None, training_datasets=None, public_training_code=None, + public_training_data=None, ) jina_embedding_s_en_v1 = ModelMeta( @@ -305,4 +309,5 @@ def encode( adapted_from=None, training_datasets=None, public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/linq_models.py b/mteb/models/linq_models.py index 11cfa74ed..ead10ebf7 100644 --- a/mteb/models/linq_models.py +++ b/mteb/models/linq_models.py @@ -40,5 +40,6 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/llm2vec_models.py b/mteb/models/llm2vec_models.py index a5f1a69a3..28197e5c8 100644 --- a/mteb/models/llm2vec_models.py +++ b/mteb/models/llm2vec_models.py @@ -126,6 +126,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, + public_training_data=None, ) llm2vec_llama3_8b_unsupervised = ModelMeta( @@ -151,6 +152,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, + public_training_data=None, ) @@ -177,6 +179,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, + public_training_data=None, ) llm2vec_mistral7b_unsupervised = ModelMeta( @@ -202,6 +205,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, 
public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, + public_training_data=None, ) llm2vec_llama2_7b_supervised = ModelMeta( @@ -227,6 +231,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, + public_training_data=None, ) llm2vec_llama2_7b_unsupervised = ModelMeta( @@ -252,6 +257,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, + public_training_data=None, ) llm2vec_sheared_llama_supervised = ModelMeta( @@ -277,6 +283,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, + public_training_data=None, ) llm2vec_sheared_llama_unsupervised = ModelMeta( @@ -302,4 +309,5 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, + public_training_data=None, ) diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index 5233ecec6..ba6e3e816 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -22,6 +22,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/Haon-Chen/speed-embedding-7b-instruct", similarity_fn_name="cosine", @@ -42,6 +43,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Gameselo/STS-multilingual-mpnet-base-v2", similarity_fn_name="cosine", @@ -62,6 +64,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", similarity_fn_name="cosine", @@ -82,6 +85,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-v1", similarity_fn_name="cosine", @@ -102,6 +106,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/Hum-Works/lodestone-base-4096-v1", similarity_fn_name="cosine", @@ -164,6 +169,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Jaume/gemma-2b-embeddings", similarity_fn_name="cosine", @@ -184,6 +190,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/BeastyZ/e5-R-mistral-7b", similarity_fn_name="cosine", @@ -210,6 +217,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], 
reference="https://huggingface.co/Lajavaness/bilingual-embedding-base", similarity_fn_name="cosine", @@ -235,6 +243,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Lajavaness/bilingual-embedding-large", similarity_fn_name="cosine", @@ -260,6 +269,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Lajavaness/bilingual-embedding-small", similarity_fn_name="cosine", @@ -280,6 +290,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Bulbasaur", similarity_fn_name="cosine", @@ -301,6 +312,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Ivysaur", similarity_fn_name="cosine", @@ -322,6 +334,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Squirtle", similarity_fn_name="cosine", @@ -343,6 +356,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Venusaur", similarity_fn_name="cosine", @@ -364,6 +378,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Wartortle", similarity_fn_name="cosine", @@ -385,6 +400,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/gte-micro", similarity_fn_name="cosine", @@ -405,6 +421,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/gte-micro-v4", similarity_fn_name="cosine", @@ -425,6 +442,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/OrdalieTech/Solon-embeddings-large-0.1", similarity_fn_name="cosine", @@ -445,6 +463,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", @@ -465,6 +484,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet", similarity_fn_name="cosine", @@ -487,6 +507,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", @@ -509,6 +530,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + 
public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-labse-Matryoshka", similarity_fn_name="cosine", @@ -531,6 +553,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet", similarity_fn_name="cosine", @@ -553,6 +576,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", @@ -573,6 +597,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/consciousAI/cai-lunaris-text-embeddings", similarity_fn_name="cosine", @@ -593,6 +618,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/consciousAI/cai-stellaris-text-embeddings", similarity_fn_name="cosine", @@ -613,6 +639,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/bge-m3-custom-fr", similarity_fn_name="cosine", @@ -633,6 +660,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.2", similarity_fn_name="cosine", @@ -653,6 +681,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.3", similarity_fn_name="cosine", @@ -673,6 +702,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.4", similarity_fn_name="cosine", @@ -694,6 +724,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/thenlper/gte-base", similarity_fn_name="cosine", @@ -714,6 +745,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/thenlper/gte-large", similarity_fn_name="cosine", @@ -734,6 +766,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/thenlper/gte-small", similarity_fn_name="cosine", @@ -754,6 +787,7 @@ license="gpl-3.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/OrlikB/KartonBERT-USE-base-v1", similarity_fn_name="cosine", @@ -774,6 +808,7 @@ license="lgpl", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/OrlikB/st-polish-kartonberta-base-alpha-v1", similarity_fn_name="cosine", @@ -794,6 +829,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], 
reference="https://huggingface.co/sdadas/mmlw-e5-base", similarity_fn_name="cosine", @@ -814,6 +850,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/dwzhu/e5-base-4k", similarity_fn_name="cosine", @@ -834,6 +871,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-e5-large", similarity_fn_name="cosine", @@ -854,6 +892,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-e5-small", similarity_fn_name="cosine", @@ -874,6 +913,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-roberta-base", similarity_fn_name="cosine", @@ -894,6 +934,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-roberta-large", similarity_fn_name="cosine", @@ -960,6 +1001,7 @@ license="bigscience-bloom-rail-1.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-1b1", similarity_fn_name="cosine", @@ -1026,6 +1068,7 @@ license="bigscience-bloom-rail-1.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-3b", similarity_fn_name="cosine", @@ -1092,6 +1135,7 @@ license="bigscience-bloom-rail-1.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-560m", similarity_fn_name="cosine", @@ -1158,6 +1202,7 @@ license="bigscience-bloom-rail-1.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-7b1", similarity_fn_name="cosine", @@ -1178,6 +1223,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-Embedding-v0", similarity_fn_name="cosine", @@ -1198,6 +1244,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-all-MiniLM-L6-v2", similarity_fn_name="cosine", @@ -1218,6 +1265,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-large-Embedding-v0", similarity_fn_name="cosine", @@ -1238,6 +1286,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-small-Embedding-v0", similarity_fn_name="cosine", @@ -1258,6 +1307,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/bigscience/sgpt-bloom-7b1-msmarco", similarity_fn_name="cosine", @@ -1278,6 +1328,7 @@ license=None, open_weights=True, public_training_code=None, + 
public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/aari1995/German_Semantic_STS_V2", similarity_fn_name="cosine", @@ -1299,6 +1350,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/abhinand/MedEmbed-small-v0.1", similarity_fn_name="cosine", @@ -1325,6 +1377,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/NoInstruct-small-Embedding-v0", similarity_fn_name="cosine", @@ -1345,6 +1398,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/brahmairesearch/slx-v0.1", similarity_fn_name="cosine", @@ -1365,6 +1419,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/deepfile/embedder-100p", similarity_fn_name="cosine", @@ -1385,6 +1440,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/deepvk/USER-bge-m3", similarity_fn_name="cosine", @@ -1416,6 +1472,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/infgrad/stella-base-en-v2", similarity_fn_name="cosine", @@ -1436,6 +1493,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/malenia1/ternary-weight-embedding", similarity_fn_name="cosine", @@ -1456,6 +1514,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/omarelshehy/arabic-english-sts-matryoshka", similarity_fn_name="cosine", @@ -1486,6 +1545,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/openbmb/MiniCPM-Embedding", similarity_fn_name="cosine", @@ -1516,6 +1576,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/shibing624/text2vec-base-multilingual", similarity_fn_name="cosine", @@ -1537,6 +1598,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/silma-ai/silma-embeddding-matryoshka-v0.1", similarity_fn_name="cosine", @@ -1557,6 +1619,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/zeta-alpha-ai/Zeta-Alpha-E5-Mistral", similarity_fn_name="cosine", @@ -1577,6 +1640,7 @@ license="apache-2", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/DMetaSoul/sbert-chinese-general-v1", similarity_fn_name="cosine", @@ -1601,6 +1665,7 @@ license="apache-2", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", 
"Sentence Transformers"], reference="https://huggingface.co/DMetaSoul/Dmeta-embedding-zh-small/", similarity_fn_name="cosine", @@ -1620,6 +1685,7 @@ license="not specified", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/lier007/xiaobu-embedding", similarity_fn_name="cosine", @@ -1640,6 +1706,7 @@ license="not specified", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/lier007/xiaobu-embedding-v2", similarity_fn_name="cosine", @@ -1660,6 +1727,7 @@ license="not specified", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Classical/Yinka", similarity_fn_name="cosine", @@ -1680,6 +1748,7 @@ license="cc-by-nc-4.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Classical/Yinka", similarity_fn_name="cosine", diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index afbf9df62..33da211c7 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -75,7 +75,8 @@ def encode( adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, training_datasets=bge_training_data, # distilled - public_training_code="https://github.com/MinishLab/model2vec", # + public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) @@ -101,6 +102,7 @@ def encode( superseded_by=None, training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) m2v_base_output = ModelMeta( @@ -125,6 +127,7 @@ def encode( superseded_by=None, training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) m2v_multilingual_output = ModelMeta( @@ -149,6 +152,7 @@ def encode( superseded_by=None, training_datasets=None, public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) potion_base_2m = ModelMeta( @@ -173,6 +177,7 @@ def encode( superseded_by=None, training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) potion_base_4m = ModelMeta( @@ -197,6 +202,7 @@ def encode( superseded_by=None, training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) potion_base_8m = ModelMeta( @@ -221,4 +227,5 @@ def encode( superseded_by=None, training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) diff --git a/mteb/models/moka_models.py b/mteb/models/moka_models.py index d3943d78d..1504b4078 100644 --- a/mteb/models/moka_models.py +++ b/mteb/models/moka_models.py @@ -96,7 +96,8 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=None, # Not published + public_training_code=None, + public_training_data=None, # Not published training_datasets=m3e_dataset, ) @@ -117,7 +118,8 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=None, # Not published + public_training_code=None, + public_training_data=None, # Not published 
training_datasets=m3e_dataset, ) @@ -139,6 +141,7 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=None, # Not published + public_training_code=None, + public_training_data=None, # Not published training_datasets=m3e_dataset, ) diff --git a/mteb/models/mxbai_models.py b/mteb/models/mxbai_models.py index 04978a190..921db1787 100644 --- a/mteb/models/mxbai_models.py +++ b/mteb/models/mxbai_models.py @@ -27,5 +27,6 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/no_instruct_sentence_models.py b/mteb/models/no_instruct_sentence_models.py index a0596b9bd..9ff5cf901 100644 --- a/mteb/models/no_instruct_sentence_models.py +++ b/mteb/models/no_instruct_sentence_models.py @@ -100,5 +100,6 @@ def encode( # type: ignore adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/nomic_models.py b/mteb/models/nomic_models.py index 5d9da7b59..772d92902 100644 --- a/mteb/models/nomic_models.py +++ b/mteb/models/nomic_models.py @@ -127,6 +127,7 @@ def encode( # type: ignore adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -154,6 +155,7 @@ def encode( # type: ignore adapted_from=None, superseded_by="nomic-ai/nomic-embed-text-v1.5", public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -181,6 +183,7 @@ def encode( # type: ignore adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -209,6 +212,7 @@ def encode( # type: ignore adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -238,5 +242,6 @@ def encode( # type: ignore adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/nvidia_models.py b/mteb/models/nvidia_models.py index 1f345a62b..1997a8527 100644 --- a/mteb/models/nvidia_models.py +++ b/mteb/models/nvidia_models.py @@ -141,6 +141,7 @@ def encode( use_instructions=True, training_datasets=nvidia_training_datasets, public_training_code=None, + public_training_data=None, ) NV_embed_v1 = ModelMeta( @@ -164,4 +165,5 @@ def encode( use_instructions=True, training_datasets=nvidia_training_datasets, public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py index 863c9d782..079e7c936 100644 --- a/mteb/models/openai_models.py +++ b/mteb/models/openai_models.py @@ -135,7 +135,8 @@ def _to_numpy(self, embedding_response) -> np.ndarray: similarity_fn_name="cosine", framework=["API"], use_instructions=False, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) text_embedding_3_large = ModelMeta( @@ -156,7 +157,8 @@ def _to_numpy(self, embedding_response) -> np.ndarray: framework=["API"], use_instructions=False, n_parameters=None, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, license=None, similarity_fn_name=None, @@ -179,7 +181,8 @@ def _to_numpy(self, embedding_response) -> np.ndarray: framework=["API"], use_instructions=False, n_parameters=None, - public_training_code=None, # assumed + 
public_training_code=None, + public_training_data=None, # assumed training_datasets=None, license=None, similarity_fn_name=None, diff --git a/mteb/models/overview.py b/mteb/models/overview.py index ad93efb31..e444b1105 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -226,6 +226,7 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: embed_dim=None, open_weights=True, public_training_code=None, + public_training_data=None, use_instructions=None, ) except Exception as e: @@ -241,6 +242,7 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: license=None, open_weights=True, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -273,6 +275,7 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe license=None, open_weights=True, public_training_code=None, + public_training_data=None, use_instructions=None, training_datasets=None, ) @@ -291,6 +294,7 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe license=None, open_weights=True, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, diff --git a/mteb/models/piccolo_models.py b/mteb/models/piccolo_models.py index bb92b5567..d51487b8b 100644 --- a/mteb/models/piccolo_models.py +++ b/mteb/models/piccolo_models.py @@ -21,6 +21,7 @@ superseded_by=None, adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=None, # They don't specify ) @@ -42,5 +43,6 @@ superseded_by=None, adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=None, # They don't say ) diff --git a/mteb/models/promptriever_models.py b/mteb/models/promptriever_models.py index a7066817a..287fd3ef9 100644 --- a/mteb/models/promptriever_models.py +++ b/mteb/models/promptriever_models.py @@ -65,6 +65,7 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["PyTorch", "Tevatron"], use_instructions=True, public_training_code=None, + public_training_data=None, ) promptriever_llama3 = ModelMeta( @@ -90,6 +91,7 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["PyTorch", "Tevatron"], use_instructions=True, public_training_code=None, + public_training_data=None, ) @@ -116,6 +118,7 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["PyTorch", "Tevatron"], use_instructions=True, public_training_code=None, + public_training_data=None, ) promptriever_mistral_v1 = ModelMeta( @@ -141,4 +144,5 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["PyTorch", "Tevatron"], use_instructions=True, public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/repllama_models.py b/mteb/models/repllama_models.py index 5ae4c0d8c..2c5ef6e44 100644 --- a/mteb/models/repllama_models.py +++ b/mteb/models/repllama_models.py @@ -150,6 +150,7 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["PyTorch", "Tevatron"], use_instructions=True, public_training_code=None, + public_training_data=None, ) @@ -176,5 +177,6 @@ def loader_inner(**kwargs: Any) -> Encoder: framework=["PyTorch", "Tevatron"], use_instructions=True, public_training_code=None, + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/rerankers_custom.py b/mteb/models/rerankers_custom.py index 5609fdf83..1a0fd1f6b 100644 --- a/mteb/models/rerankers_custom.py +++ b/mteb/models/rerankers_custom.py @@ -209,6 +209,7 @@ def loader_inner(**kwargs: Any) -> Encoder: 
embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -233,6 +234,7 @@ def loader_inner(**kwargs: Any) -> Encoder: embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -289,6 +291,7 @@ def loader_inner(**kwargs: Any) -> Encoder: embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, diff --git a/mteb/models/rerankers_monot5_based.py b/mteb/models/rerankers_monot5_based.py index 5bc50bad7..c53b36400 100644 --- a/mteb/models/rerankers_monot5_based.py +++ b/mteb/models/rerankers_monot5_based.py @@ -301,6 +301,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -324,6 +325,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -347,6 +349,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -370,6 +373,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -405,6 +409,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -439,6 +444,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -473,6 +479,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -507,6 +514,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -530,6 +538,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -553,6 +562,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -576,6 +586,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -600,6 +611,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ 
-728,6 +740,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -750,6 +763,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index d8c7e8451..297c7f314 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -23,6 +23,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -41,6 +42,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -59,6 +61,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -77,6 +80,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -124,6 +128,7 @@ # "CarlBrendt/Summ_Dialog_News": ["train"], }, public_training_code=None, + public_training_data=None, ) deberta_v1_ru = ModelMeta( @@ -141,6 +146,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -159,6 +165,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -177,6 +184,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -195,6 +203,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -213,6 +222,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -231,6 +241,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, # source model in unknown # Not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, ) @@ -252,6 +263,7 @@ training_datasets=None, # source model in unknown # not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, public_training_code=None, + public_training_data=None, ) @@ -279,6 +291,7 @@ license="mit", similarity_fn_name="cosine", public_training_code=None, + public_training_data=None, training_datasets=None, framework=["Sentence Transformers", "PyTorch"], ) diff --git a/mteb/models/salesforce_models.py b/mteb/models/salesforce_models.py index 4d4a60b62..235057a6f 100644 --- a/mteb/models/salesforce_models.py +++ b/mteb/models/salesforce_models.py @@ -6,6 +6,8 @@ from mteb.model_meta import ModelMeta from mteb.models.instruct_wrapper import instruct_wrapper +from .e5_instruct import E5_MISTRAL_TRAINING_DATA + def instruction_template( instruction: str, prompt_type: PromptType | None = None @@ -13,6 +15,19 @@ def instruction_template( return f"Instruct: 
{instruction}\nQuery: " if instruction else "" +SFR_TRAINING_DATA = { # inherits from e5 + **E5_MISTRAL_TRAINING_DATA, + # From previously released blogpost which now have been taken down: + "FiQA2018": ["train"], + "FiQA2018-PL": ["train"], + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "FEVER-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQAHardNegatives": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on +} + SFR_Embedding_2_R = ModelMeta( loader=partial( # type: ignore instruct_wrapper, @@ -41,16 +56,8 @@ def instruction_template( use_instructions=True, adapted_from="intfloat/e5-mistral-7b-instruct", public_training_code=None, - training_datasets={ # inherits from e5 - "MSMARCO": ["train"], - "MSMARCOHardNegatives": ["train"], - "NanoMSMARCORetrieval": ["train"], - "MSMARCO-PL": ["train"], # translation not trained on - "NQ": ["train"], - "NQHardNegatives": ["train"], - "NanoNQRetrieval": ["train"], - "NQ-PL": ["train"], # translation not trained on - }, + public_training_data=None, + training_datasets=SFR_TRAINING_DATA, ) @@ -79,14 +86,6 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, - training_datasets={ # inherits from e5 - "MSMARCO": ["train"], - "MSMARCOHardNegatives": ["train"], - "NanoMSMARCORetrieval": ["train"], - "MSMARCO-PL": ["train"], # translation not trained on - "NQ": ["train"], - "NQHardNegatives": ["train"], - "NanoNQRetrieval": ["train"], - "NQ-PL": ["train"], # translation not trained on - }, + public_training_data=None, + training_datasets=SFR_TRAINING_DATA, ) diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index fa48ae7cc..eec65049d 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -112,6 +112,7 @@ adapted_from=None, training_datasets=sent_trf_training_dataset, public_training_code=None, + public_training_data=None, ) all_MiniLM_L12_v2 = ModelMeta( @@ -132,6 +133,7 @@ adapted_from=None, training_datasets=sent_trf_training_dataset, public_training_code=None, + public_training_data=None, ) paraphrase_multilingual_MiniLM_L12_v2 = ModelMeta( @@ -152,6 +154,7 @@ adapted_from=None, training_datasets=sent_trf_training_dataset, # assumed (probably some parallel as well) public_training_code=None, + public_training_data=None, ) paraphrase_multilingual_mpnet_base_v2 = ModelMeta( @@ -183,6 +186,7 @@ # "yahoo-answers": yahoo_answers_train_dataset, # "stack-exchange": stack_exchange_train_dataset, public_training_code=None, + public_training_data=None, ) labse = ModelMeta( @@ -203,6 +207,7 @@ adapted_from=None, training_datasets=None, # scraped and mined webdata including CC, wiki, see section 3.1 https://aclanthology.org/2022.acl-long.62.pdf public_training_code="https://www.kaggle.com/models/google/labse/tensorFlow2/labse/2?tfhub-redirect=true", + public_training_data=None, ) multi_qa_MiniLM_L6_cos_v1 = ModelMeta( @@ -223,6 +228,7 @@ adapted_from="nreimers/MiniLM-L6-H384-uncased", training_datasets=sent_trf_training_dataset, # assumed public_training_code=None, + public_training_data=None, ) all_mpnet_base_v2 = ModelMeta( @@ -243,6 +249,7 @@ adapted_from=None, training_datasets=sent_trf_training_dataset, public_training_code=None, + public_training_data=None, ) @@ -274,4 +281,5 @@ # "sentence-transformers/natural-questions": ["train"], }, public_training_code=None, + public_training_data=None, ) diff --git 
a/mteb/models/stella_models.py b/mteb/models/stella_models.py index 44aa1f860..7210b287c 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -30,6 +30,7 @@ reference="https://huggingface.co/dunzhang/stella_en_400M_v5", training_datasets=None, public_training_code=None, + public_training_data=None, ) stella_en_1_5b = ModelMeta( @@ -56,6 +57,7 @@ reference="https://huggingface.co/dunzhang/stella_en_1.5B_v5", training_datasets=None, public_training_code=None, + public_training_data=None, ) stella_large_zh_v3_1792d = ModelMeta( @@ -75,6 +77,7 @@ superseded_by="dunzhang/stella-mrl-large-zh-v3.5-1792d", adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets={ # Not in MTEB: # - infgrad/dialogue_rewrite_llm @@ -99,6 +102,7 @@ superseded_by=None, adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets={ # Not in MTEB: # - infgrad/dialogue_rewrite_llm @@ -124,6 +128,7 @@ superseded_by=None, adapted_from="dunzhang/stella-large-zh-v3-1792d", public_training_code=None, + public_training_data=None, training_datasets=None, # Not specified ) @@ -144,6 +149,7 @@ superseded_by=None, adapted_from="dunzhang/stella-mrl-large-zh-v3.5-1792d", public_training_code=None, + public_training_data=None, training_datasets={ # It's a bit unclear what they have trained on to be honest, because they don't list all # And they also have some rather cryptic description of their training procedure, but at diff --git a/mteb/models/text2vec_models.py b/mteb/models/text2vec_models.py index 12322e69e..86a9bcca4 100644 --- a/mteb/models/text2vec_models.py +++ b/mteb/models/text2vec_models.py @@ -21,7 +21,8 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=None, # Couldn't find it + public_training_code=None, + public_training_data=None, # Couldn't find it training_datasets={ # source: https://huggingface.co/shibing624/text2vec-base-chinese # Not in MTEB @@ -46,7 +47,8 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=None, # Couldn't find it + public_training_code=None, + public_training_data=None, # Couldn't find it training_datasets={ # source: https://huggingface.co/shibing624/text2vec-base-chinese # Not in MTEB @@ -87,7 +89,8 @@ use_instructions=False, superseded_by=None, adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", - public_training_code=None, # Couldn't find it + public_training_code=None, + public_training_data=None, # Couldn't find it training_datasets={ # source: https://huggingface.co/shibing624/text2vec-base-chinese # Not in MTEB diff --git a/mteb/models/uae_models.py b/mteb/models/uae_models.py index bd8be4869..8d97703ef 100644 --- a/mteb/models/uae_models.py +++ b/mteb/models/uae_models.py @@ -83,4 +83,5 @@ def encode( "SNLI": [], }, public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index a98bc041b..a637dee36 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -12,6 +12,11 @@ from .wrapper import Wrapper +VOYAGE_TRAINING_DATA = { + # Self-reported (message from VoyageAI member) + # synthetic data +} + def token_limit(max_tpm: int, interval: int = 60): limit_interval_start_ts = time.time() @@ -156,8 +161,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + 
public_training_data=None, ) voyage_finance_2 = ModelMeta( @@ -179,8 +185,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_law_2 = ModelMeta( @@ -202,8 +209,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_code_2 = ModelMeta( @@ -225,8 +233,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_large_2 = ModelMeta( @@ -248,8 +257,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_2 = ModelMeta( @@ -271,8 +281,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_multilingual_2 = ModelMeta( name="voyageai/voyage-multilingual-2", @@ -293,8 +304,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_3 = ModelMeta( @@ -316,8 +328,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_3_lite = ModelMeta( @@ -339,6 +352,79 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, + public_training_code=None, + public_training_data=None, +) + + +voyage_3_exp = ModelMeta( + name="voyageai/voyage-3-m-exp", + revision="1", + release_date=None, # not released + languages=None, # supported languages not specified + loader=partial( + VoyageWrapper, + model_name="voyage-3-m-exp", + model_prompts=model_prompts, + ), + max_tokens=32000, + embed_dim=512, + open_weights=False, + n_parameters=None, + license=None, + reference="https://huggingface.co/voyageai/voyage-3-m-exp", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=True, + training_datasets={ + # MTEB(eng, classic) training data: + "ArguAna": ["train"], + "ArguAna-PL": ["train"], + "NanoArguAnaRetrieval": ["train"], + "HotpotQA": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "NanoFEVERRetrieval": ["train"], + "FiQA2018": ["train"], + "FiQA2018-PL": ["train"], # translation not trained on + "STS12": ["train"], + "STS22": ["train"], + "AmazonReviewsClassification": ["train"], + 
"AmazonCounterfactualClassification": ["train"], + "Banking77Classification": ["train"], + "EmotionClassification": ["train"], + "ImdbClassification": ["train"], + "MTOPIntentClassification": ["train"], + "ToxicConversationsClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "ArxivClusteringP2P": ["train"], + "ArxivClusteringP2P.v2": ["train"], + "ArxivClusteringS2S": ["train"], + "ArxivClusteringS2S.v2": ["train"], + "BiorxivClusteringP2P": ["train"], + "BiorxivClusteringP2P.v2": ["train"], + "BiorxivClusteringS2S": ["train"], + "BiorxivClusteringS2S.v2": ["train"], + "MedrxivClusteringP2P": ["train"], + "MedrxivClusteringP2P.v2": ["train"], + "MedrxivClusteringS2S": ["train"], + "MedrxivClusteringS2S.v2": ["train"], + "TwentyNewsgroupsClustering": ["train"], + "TwentyNewsgroupsClustering.v2": ["train"], + "STSBenchmark": ["train"], + "STSBenchmarkMultilingualSTS": ["train"], # translated, not trained on + }, public_training_code=None, + public_training_data=None, ) diff --git a/scripts/generate_metadata.py b/scripts/generate_metadata.py index a192fa134..4ae87fdbc 100644 --- a/scripts/generate_metadata.py +++ b/scripts/generate_metadata.py @@ -242,6 +242,7 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: license=None, open_weights=True, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py index fb0cf6cf5..dc65dae90 100644 --- a/tests/test_tasks/test_mteb_rerank.py +++ b/tests/test_tasks/test_mteb_rerank.py @@ -377,6 +377,7 @@ def test_reranker_same_ndcg1(): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, reference=None, similarity_fn_name=None, use_instructions=None, From 2fac8ba1efc5b514071b7b68f5ea425a9acbf4b1 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 21 Jan 2025 10:54:16 +0000 Subject: [PATCH 35/49] 1.29.11 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1404925f4..a775a5fec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.10" +version = "1.29.11" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From a8cc88778623ee4e46c7c27ea5b5bc98e534165e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Tue, 21 Jan 2025 12:29:59 +0100 Subject: [PATCH 36/49] fix: Leaderboard Refinements (#1849) * Added better descriptions to benchmarks and removed beta tags * Fixed zero-shot filtering on app loading * Added zero-shot definition in an accordion * NaN values are now filled with blank * Added type hints to filter_models --- mteb/benchmarks/benchmarks.py | 69 +++++++++++----- mteb/leaderboard/app.py | 150 +++++++++++++++++++++------------- mteb/leaderboard/table.py | 8 +- 3 files changed, 145 insertions(+), 82 deletions(-) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 97dc6acb6..233c7a79b 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -71,7 +71,7 @@ def load_results( MTEB_EN = Benchmark( - name="MTEB(eng, beta)", + name="MTEB(eng)", tasks=MTEBTasks( get_tasks( tasks=[ @@ -128,7 +128,13 @@ def load_results( get_task("STS22.v2", eval_splits=["test"], hf_subsets=["en"]), ), ), - description="English benchmarks 
from MTEB", + description="""The new English Massive Text Embedding Benchmark. +This benchmark was created to account for the fact that many models have now been finetuned +to tasks in the original MTEB, and contains tasks that are not as frequently used for model training. +This way the new benchmark and leaderboard can give our users a more realistic expectation of models' generalization performance. + +The original MTEB leaderboard is available under the [MTEB(eng, classic)](http://mteb-leaderboard-2-demo.hf.space/?benchmark_name=MTEB%28eng%2C+classic%29) tab. + """, citation="", contacts=["KennethEnevoldsen", "Muennighoff"], ) @@ -216,7 +222,12 @@ def load_results( get_task("STS22", eval_splits=["test"], hf_subsets=["en"]), ) ), - description="The original English benchmark by Muennighoff et al., (2023).", + description="""The original English benchmark by Muennighoff et al., (2023). +This page is an adaptation of the [old MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard). + +> We recommend that you use [MTEB(eng)](http://mteb-leaderboard-2-demo.hf.space/?benchmark_name=MTEB%28eng%29) instead, +as many models have been tuned on MTEB(eng, classic) datasets, and MTEB(eng) might give a more accurate representation of models' generalization performance. + """, citation="""@inproceedings{muennighoff-etal-2023-mteb, title = "{MTEB}: Massive Text Embedding Benchmark", author = "Muennighoff, Niklas and @@ -275,7 +286,7 @@ def load_results( "STS22", ], ), - description="Main Russian benchmarks from MTEB", + description="A Russian version of the Massive Text Embedding Benchmark with a number of novel Russian tasks in all task categories of the original MTEB.", reference="https://aclanthology.org/2023.eacl-main.148/", citation="""@misc{snegirev2024russianfocusedembeddersexplorationrumteb, title={The Russian-focused embedders' exploration: ruMTEB benchmark and Russian embedding model design}, @@ -324,8 +335,8 @@ def load_results( "LegalQuAD", ] ), - description="Legal benchmarks from MTEB.", - reference="https://aclanthology.org/2023.eacl-main.148/", + description="A benchmark of retrieval tasks in the legal domain.", + reference=None, citation=None, ) @@ -365,7 +376,10 @@ def load_results( "Tatoeba", ] ), - description="BitextMining benchmark from MINERS", + description="""Bitext Mining texts from the MINERS benchmark, a benchmark designed to evaluate the + ability of multilingual LMs in semantic retrieval tasks, + including bitext mining and classification via retrieval-augmented contexts. 
+ """, reference="https://arxiv.org/pdf/2406.07424", citation=""" @article{winata2024miners, @@ -533,7 +547,7 @@ def load_results( ) + (get_task("STS22", eval_splits=["test"], hf_subsets=["fr"]),) ), - description="Main French benchmarks from MTEB", + description="MTEB-French, a French expansion of the original benchmark with high-quality native French datasets.", reference="https://arxiv.org/abs/2405.20468", citation="""@misc{ciancone2024mtebfrenchresourcesfrenchsentence, title={MTEB-French: Resources for French Sentence Embedding Evaluation and Analysis}, @@ -581,7 +595,7 @@ def load_results( "STS22", ], ), - description="Main German benchmarks from MTEB", + description="A benchmark for text-embedding performance in German.", reference="https://arxiv.org/html/2401.02709v1", citation="""@misc{wehrli2024germantextembeddingclustering, title={German Text Embedding Clustering Benchmark}, @@ -613,7 +627,7 @@ def load_results( "KorSTS", ], ), - description="Main Korean benchmarks from MTEB", + description="A benchmark and leaderboard for evaluation of text embedding in Korean.", reference=None, citation=None, ) @@ -650,7 +664,11 @@ def load_results( ) + (get_task("STS22", eval_splits=["test"], hf_subsets=["pl"]),), ), - description="Main Polish benchmarks from MTEB", + description="""Polish Massive Text Embedding Benchmark (PL-MTEB), a comprehensive benchmark for text embeddings in Polish. The PL-MTEB consists of 28 diverse NLP +tasks from 5 task types. With tasks adapted based on previously used datasets by the Polish +NLP community. In addition, a new PLSC (Polish Library of Science Corpus) dataset was created +consisting of titles and abstracts of scientific publications in Polish, which was used as the basis for +two novel clustering tasks.""", # Rephrased from the abstract reference="https://arxiv.org/abs/2405.10138", citation="""@article{poswiata2024plmteb, title={PL-MTEB: Polish Massive Text Embedding Benchmark}, @@ -695,14 +713,14 @@ def load_results( "typescript", ], ), - description="Main code benchmarks from MTEB", + description="A massive code embedding benchmark covering retrieval tasks in a miriad of popular programming languages.", reference=None, citation=None, ) MTEB_multilingual = Benchmark( - name="MTEB(Multilingual, beta)", + name="MTEB(Multilingual)", tasks=get_tasks( tasks=[ "BornholmBitextMining", @@ -839,7 +857,7 @@ def load_results( "MIRACLRetrievalHardNegatives", ], ), - description="The Multilingual benchmarks from MMTEB. 
Currently under development.", + description="A large-scale multilingual expansion of MTEB, driven mainly by highly curated community contributions covering 250+ languages.", reference=None, citation=None, contacts=["KennethEnevoldsen", "isaac-chung"], ) @@ -874,7 +892,7 @@ "ESCIReranking", ], ), - description="Main Japanese benchmarks from MTEB", + description="JMTEB is a benchmark for evaluating Japanese text embedding models.", reference="https://github.com/sbintuitions/JMTEB", citation=None, ) @@ -914,7 +932,7 @@ ] MTEB_INDIC = Benchmark( - name="MTEB(Indic, beta)", + name="MTEB(Indic)", tasks=get_tasks( tasks=[ # Bitext @@ -951,7 +969,7 @@ languages=indic_languages, exclusive_language_filter=True, ), - description="Main Indic benchmark from MMTEB", + description="A regional geopolitical text embedding benchmark targeting embedding performance on Indic languages.", reference=None, citation=None, contacts=["KennethEnevoldsen", "isaac-chung"], ) @@ -1002,7 +1020,7 @@ ] MTEB_EU = Benchmark( - name="MTEB(Europe, beta)", + name="MTEB(Europe)", tasks=get_tasks( tasks=[ "BornholmBitextMining", @@ -1083,7 +1101,7 @@ languages=eu_languages, exclusive_language_filter=True, ), - description="Main European benchmark from MMTEB", + description="A regional geopolitical text embedding benchmark targeting embedding performance on European languages.", reference=None, citation=None, contacts=["KennethEnevoldsen", "isaac-chung"], ) @@ -1101,7 +1119,10 @@ "LEMBWikimQARetrieval", ], ), - description="The main benchmark for evaluating long document retrieval.", + description="""LongEmbed is a benchmark oriented at exploring models' performance on long-context retrieval. + The benchmark comprises two synthetic tasks and four carefully chosen real-world tasks, + featuring documents of varying length and dispersed target information. + """, # Pieced together from paper abstract. reference="https://arxiv.org/abs/2404.12096v2", citation="""@article{zhu2024longembed, title={LongEmbed: Extending Embedding Models for Long Context Retrieval}, @@ -1116,7 +1137,13 @@ tasks=get_tasks( tasks=["BrightRetrieval"], ), - description="A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.", + description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval. + BRIGHT is the first text retrieval + benchmark that requires intensive reasoning to retrieve relevant documents, with + a dataset consisting of 1,384 real-world queries spanning diverse domains, such as + economics, psychology, mathematics, and coding. These queries are drawn from + naturally occurring and carefully curated human data. 
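(Editorial aside, not part of the patch: once the "beta" suffix is dropped, the benchmarks above are addressed by their new names. A minimal usage sketch under that assumption, relying only on the get_benchmark, get_model, and MTEB calls that appear elsewhere in this series; the model id is just an example.)

import mteb

bench = mteb.get_benchmark("MTEB(Multilingual)")  # formerly "MTEB(Multilingual, beta)"
model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")  # any supported model id
evaluation = mteb.MTEB(tasks=[bench])  # a Benchmark can be passed in place of a task list
results = evaluation.run(model, output_folder="results")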
+ """, reference="https://brightbenchmark.github.io/", citation="""@article{su2024bright, title={Bright: A realistic and challenging benchmark for reasoning-intensive retrieval}, diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index cb806e467..f339c6142 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -6,6 +6,7 @@ import tempfile import time from pathlib import Path +from typing import Literal from urllib.parse import urlencode import gradio as gr @@ -48,9 +49,12 @@ def produce_benchmark_link(benchmark_name: str, request: gr.Request) -> str: return md +DEFAULT_BENCHMARK_NAME = "MTEB(Multilingual)" + + def set_benchmark_on_load(request: gr.Request): query_params = request.query_params - return query_params.get("benchmark_name", "MTEB(Multilingual, beta)") + return query_params.get("benchmark_name", DEFAULT_BENCHMARK_NAME) def download_table(table: pd.DataFrame) -> Path: @@ -117,23 +121,75 @@ def update_task_info(task_names: str) -> gr.DataFrame: return gr.DataFrame(df, datatype=["markdown"] + ["str"] * (len(df.columns) - 1)) +# Model sizes in million parameters +MIN_MODEL_SIZE, MAX_MODEL_SIZE = 0, 10_000 + + +def filter_models( + model_names, + task_select, + availability, + compatibility, + instructions, + model_size, + zero_shot_setting, +): + lower, upper = model_size + # Setting to None, when the user doesn't specify anything + if (lower == MIN_MODEL_SIZE) and (upper == MAX_MODEL_SIZE): + lower, upper = None, None + else: + # Multiplying by millions + lower = lower * 1e6 + upper = upper * 1e6 + model_metas = mteb.get_model_metas( + model_names=model_names, + open_weights=availability, + use_instructions=instructions, + frameworks=compatibility, + n_parameters_range=(lower, upper), + ) + tasks = mteb.get_tasks(tasks=task_select) + models_to_keep = set() + for model_meta in model_metas: + is_model_zero_shot = model_meta.is_zero_shot_on(tasks) + if is_model_zero_shot is None: + if zero_shot_setting == "hard": + continue + elif not is_model_zero_shot: + if zero_shot_setting != "off": + continue + models_to_keep.add(model_meta.name) + return list(models_to_keep) + + logger.info("Loading all benchmark results") all_results = load_results() -# Model sizes in million parameters -min_model_size, max_model_size = 0, 10_000 - benchmarks = mteb.get_benchmarks() all_benchmark_results = { benchmark.name: benchmark.load_results(base_results=all_results) for benchmark in benchmarks } -default_benchmark = mteb.get_benchmark("MTEB(Multilingual, beta)") +default_benchmark = mteb.get_benchmark(DEFAULT_BENCHMARK_NAME) default_results = all_benchmark_results[default_benchmark.name] logger.info("Benchmark results loaded") default_scores = default_results.get_scores(format="long") -summary_table, per_task_table = scores_to_tables(default_scores) +all_models = list({entry["model_name"] for entry in default_scores}) +filtered_models = filter_models( + all_models, + default_results.task_names, + availability=None, + compatibility=[], + instructions=None, + model_size=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), + zero_shot_setting="soft", +) + +summary_table, per_task_table = scores_to_tables( + [entry for entry in default_scores if entry["model_name"] in filtered_models] +) benchmark_select = gr.Dropdown( [bench.name for bench in benchmarks], @@ -258,14 +314,14 @@ def update_task_info(task_names: str) -> gr.DataFrame: interactive=True, ) model_size = RangeSlider( - minimum=min_model_size, - maximum=max_model_size, - value=(min_model_size, max_model_size), + minimum=MIN_MODEL_SIZE, + 
maximum=MAX_MODEL_SIZE, + value=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), label="Model Size (#M Parameters)", interactive=True, ) scores = gr.State(default_scores) - models = gr.State(list({entry["model_name"] for entry in default_scores})) + models = gr.State(filtered_models) with gr.Row(): with gr.Column(): description = gr.Markdown( @@ -295,6 +351,10 @@ def update_task_info(task_names: str) -> gr.DataFrame: """ ) summary_table.render() + download_summary = gr.DownloadButton("Download Table") + download_summary.click( + download_table, inputs=[summary_table], outputs=[download_summary] + ) with gr.Accordion( "What do aggregate measures (Rank(Borda), Mean(Task), etc.) mean?", open=False, @@ -308,10 +368,19 @@ def update_task_info(task_names: str) -> gr.DataFrame: **Mean(TaskType)**: This is a weighted average across different task categories, such as classification or retrieval. It is computed by first computing the average by task category and then computing the average on each category. Similar to the Mean(Task) this measure is continuous and tends to overvalue tasks with higher variance. This score also prefers models that perform well across all task categories. """ ) - download_summary = gr.DownloadButton("Download Table") - download_summary.click( - download_table, inputs=[summary_table], outputs=[download_summary] - ) + with gr.Accordion( + "What does zero-shot mean?", + open=False, + ): + gr.Markdown( + """ +A model is considered zero-shot if it is not trained on any splits of the datasets used to derive the tasks. +E.g., if a model is trained on Natural Questions, it cannot be considered zero-shot on benchmarks containing the task “NQ” which is derived from Natural Questions. +This definition creates a few edge cases. For instance, multiple models are typically trained on Wikipedia title and body pairs, but we do not define this as leakage on, e.g., “WikipediaRetrievalMultilingual” and “WikiClusteringP2P” as these datasets are not based on title-body pairs. +Distilled, further fine-tuned, or otherwise derivative models inherit the datasets of their parent models. +Based on community feedback and research findings, this definition could change in the future. 
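(Editorial aside, not part of the patch: the accordion text above states the same rule that filter_models applies earlier in this diff. A compact restatement of that keep/drop decision, assuming ModelMeta.is_zero_shot_on(tasks) returns True, False, or None when the training data is unknown:)

from __future__ import annotations


def keep_model(is_zero_shot: bool | None, zero_shot_setting: str) -> bool:
    # "hard": keep only models that are verifiably zero-shot; unknown training data is dropped.
    # "soft": also keep models whose training data is unknown.
    # "off":  keep every model, including known non-zero-shot ones.
    if is_zero_shot is None:
        return zero_shot_setting != "hard"
    if not is_zero_shot:
        return zero_shot_setting == "off"
    return True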
+ """ + ) with gr.Tab("Performance per task"): per_task_table.render() download_per_task = gr.DownloadButton("Download Table") @@ -405,51 +474,14 @@ def update_task_list(benchmark_name, type_select, domain_select, lang_select): outputs=[task_select], ) - def filter_models( - model_names, - task_select, - availability, - compatibility, - instructions, - model_size, - zero_shot_setting, - ): - lower, upper = model_size - # Setting to None, when the user doesn't specify anything - if (lower == min_model_size) and (upper == max_model_size): - lower, upper = None, None - else: - # Multiplying by millions - lower = lower * 1e6 - upper = upper * 1e6 - model_metas = mteb.get_model_metas( - model_names=model_names, - open_weights=availability, - use_instructions=instructions, - frameworks=compatibility, - n_parameters_range=(lower, upper), - ) - tasks = mteb.get_tasks(tasks=task_select) - models_to_keep = set() - for model_meta in model_metas: - is_model_zero_shot = model_meta.is_zero_shot_on(tasks) - if is_model_zero_shot is None: - if zero_shot_setting == "hard": - continue - elif not is_model_zero_shot: - if zero_shot_setting != "off": - continue - models_to_keep.add(model_meta.name) - return list(models_to_keep) - def update_models( - scores, - tasks, - availability, - compatibility, - instructions, - model_size, - zero_shot, + scores: list[dict], + tasks: list[str], + availability: bool | None, + compatibility: list[str], + instructions: bool | None, + model_size: tuple[int, int], + zero_shot: Literal["hard", "soft", "off"], ): start_time = time.time() model_names = list({entry["model_name"] for entry in scores}) @@ -544,7 +576,7 @@ def update_models( ], outputs=[models], ) - zero_shot.input( + zero_shot.change( update_models, inputs=[ scores, diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index 041df4709..2cb5fb34b 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -218,7 +218,11 @@ def scores_to_tables( joint_table[score_columns] = joint_table[score_columns].map(format_scores) joint_table_style = ( joint_table.style.format( - {**{column: "{:.2f}" for column in score_columns}, "Rank (Borda)": "{:.0f}"} + { + **{column: "{:.2f}" for column in score_columns}, + "Rank (Borda)": "{:.0f}", + }, + na_rep="", ) .highlight_min("Rank (Borda)", props="font-weight: bold") .highlight_max(subset=score_columns, props="font-weight: bold") @@ -226,7 +230,7 @@ def scores_to_tables( task_score_columns = per_task.select_dtypes("number").columns per_task[task_score_columns] *= 100 per_task_style = per_task.style.format( - "{:.2f}", subset=task_score_columns + "{:.2f}", subset=task_score_columns, na_rep="" ).highlight_max(subset=task_score_columns, props="font-weight: bold") return ( gr.DataFrame( From afd3c77f6b9f0684d1de9b26895e9961a04d6f3d Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 21 Jan 2025 11:37:04 +0000 Subject: [PATCH 37/49] 1.29.12 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a775a5fec..15fcb6978 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.11" +version = "1.29.12" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 6da8a13f58d01a9049201cab44b9add97aaf9955 Mon Sep 17 00:00:00 2001 From: Sam <40773225+sam-hey@users.noreply.github.com> Date: Tue, 21 Jan 2025 14:22:43 +0100 Subject: [PATCH 
38/49] [v2] ci: run bm25 and ColBERT test in ci (#1829) * update install for tests * use tmp dir for tests * ref: use tmp_path for output_folder * ref: clean up tests * skip test for pylate python < 3.10 * fix: tests * fix: tests * fix: model meta CrossEncoder * test: model meta * update path test * lint * Update mteb/models/overview.py Co-authored-by: Roman Solomatin * use as_posix() * add more asserts & get embeding_dim for st * fix: MaxSim add test fix ModelMeta * fix: colbert test py 3.9 & add revision * ref: _get_model_meta --------- Co-authored-by: sam021313 <40773225+sam021313@users.noreply.github.com> Co-authored-by: Roman Solomatin --- Makefile | 2 +- mteb/evaluation/MTEB.py | 15 +++- mteb/model_meta.py | 4 +- mteb/models/__init__.py | 2 + mteb/models/colbert_models.py | 4 +- mteb/models/overview.py | 53 +++++++++++++- tests/test_benchmark/test_benchmark.py | 57 ++++++++------- ...est_benchmark_integration_with_datasets.py | 5 +- ...k_integration_with_sentencetransformers.py | 7 +- tests/test_benchmark/test_models.py | 13 +++- tests/test_cli.py | 17 +++-- tests/test_model_meta/test_model_meta.py | 73 +++++++++++++++++++ tests/test_reproducible_workflow.py | 7 +- tests/test_tasks/test_mteb_rerank.py | 28 +++---- 14 files changed, 221 insertions(+), 66 deletions(-) create mode 100644 tests/test_model_meta/test_model_meta.py diff --git a/Makefile b/Makefile index 3c68c9e0d..7d8ca4d74 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ install: install-for-tests: @echo "--- 🚀 Installing project dependencies for test ---" @echo "This ensures that the project is not installed in editable mode" - pip install ".[dev,speedtask]" + pip install ".[dev,speedtask,bm25s,pylate]" lint: @echo "--- 🧹 Running linters ---" diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 3c94f2478..377f8b72e 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -20,7 +20,10 @@ from mteb.abstasks.AbsTask import ScoresDict from mteb.encoder_interface import Encoder from mteb.model_meta import ModelMeta -from mteb.models import model_meta_from_sentence_transformers +from mteb.models import ( + model_meta_from_cross_encoder, + model_meta_from_sentence_transformers, +) from ..abstasks.AbsTask import AbsTask from ..load_results.task_results import TaskResult @@ -495,7 +498,7 @@ def create_model_meta(model: Encoder) -> ModelMeta: meta = model.mteb_model_meta # type: ignore else: try: - meta = model_meta_from_sentence_transformers(model) # type: ignore + meta = MTEB._get_model_meta(model) except AttributeError: logger.warning( "Could not find model metadata. 
Please set the model.mteb_model_meta attribute or if you are using " @@ -597,3 +600,11 @@ def _get_missing_evaluations( missing_evaluations[split]["missing_subsets"] = missing_subsets return missing_evaluations + + @staticmethod + def _get_model_meta(model: Encoder) -> ModelMeta: + if isinstance(model, CrossEncoder): + meta = model_meta_from_cross_encoder(model) + else: + meta = model_meta_from_sentence_transformers(model) + return meta diff --git a/mteb/model_meta.py b/mteb/model_meta.py index fee525cba..eed74c5b4 100644 --- a/mteb/model_meta.py +++ b/mteb/model_meta.py @@ -32,7 +32,7 @@ "PyLate", "ColBERT", ] -DISTANCE_METRICS = Literal["cosine", "max_sim", "dot"] +DISTANCE_METRICS = Literal["cosine", "MaxSim", "dot"] def sentence_transformers_loader( @@ -111,7 +111,7 @@ def get_similarity_function(self) -> Callable[[np.ndarray, np.ndarray], np.ndarr return cos_sim elif self.similarity_fn_name == "dot": return dot_score - elif self.similarity_fn_name == "max_sim": + elif self.similarity_fn_name == "MaxSim": return max_sim elif self.similarity_fn_name is None: raise ValueError("Similarity function not specified.") diff --git a/mteb/models/__init__.py b/mteb/models/__init__.py index 1c70b528c..1389e2398 100644 --- a/mteb/models/__init__.py +++ b/mteb/models/__init__.py @@ -6,6 +6,7 @@ get_model, get_model_meta, get_model_metas, + model_meta_from_cross_encoder, model_meta_from_sentence_transformers, ) from mteb.models.sentence_transformer_wrapper import SentenceTransformerWrapper @@ -17,5 +18,6 @@ "get_model_meta", "get_model_metas", "model_meta_from_sentence_transformers", + "model_meta_from_cross_encoder", "SentenceTransformerWrapper", ] diff --git a/mteb/models/colbert_models.py b/mteb/models/colbert_models.py index f4baca358..0a8c0e4a5 100644 --- a/mteb/models/colbert_models.py +++ b/mteb/models/colbert_models.py @@ -161,7 +161,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: max_tokens=180, # Reduced for Benchmarking - see ColBERT paper embed_dim=None, # Bag of Embeddings (128) for each token license="mit", - similarity_fn_name="max_sim", + similarity_fn_name="MaxSim", framework=["PyLate", "ColBERT"], reference="https://huggingface.co/colbert-ir/colbertv2.0", use_instructions=False, @@ -213,7 +213,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: max_tokens=8192, embed_dim=None, # Bag of Embeddings (128) for each token license="cc-by-nc-4.0", - similarity_fn_name="max_sim", + similarity_fn_name="MaxSim", framework=["PyLate", "ColBERT"], reference="https://huggingface.co/jinaai/jina-colbert-v2", use_instructions=False, diff --git a/mteb/models/overview.py b/mteb/models/overview.py index 93eaa9ab5..c72fe2ed8 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -6,7 +6,7 @@ from typing import Any from huggingface_hub import ModelCard -from sentence_transformers import SentenceTransformer +from sentence_transformers import CrossEncoder, SentenceTransformer from mteb.abstasks.AbsTask import AbsTask from mteb.encoder_interface import Encoder @@ -172,6 +172,11 @@ def get_model(model_name: str, revision: str | None = None, **kwargs: Any) -> En if not meta.similarity_fn_name: meta.similarity_fn_name = _meta.similarity_fn_name + elif isinstance(model, CrossEncoder): + _meta = model_meta_from_cross_encoder(model.model) + if meta.revision is None: + meta.revision = _meta.revision if _meta.revision else meta.revision + model.mteb_model_meta = meta # type: ignore return model @@ -251,6 +256,49 @@ def model_meta_from_hf_hub(model_name: str) 
-> ModelMeta: ) +def model_meta_from_cross_encoder(model: CrossEncoder) -> ModelMeta: + try: + name = model.model.name_or_path + + meta = ModelMeta( + name=name, + revision=model.config._commit_hash, + release_date=None, + languages=None, + framework=["Sentence Transformers"], + similarity_fn_name=None, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + open_weights=True, + public_training_code=None, + use_instructions=None, + training_datasets=None, + ) + except AttributeError as e: + logger.warning( + f"Failed to extract metadata from model: {e}. Upgrading to sentence-transformers v3.0.0 or above is recommended." + ) + meta = ModelMeta( + name=None, + revision=None, + languages=None, + release_date=None, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + open_weights=True, + public_training_code=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=[], + ) + return meta + + def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMeta: try: name = ( @@ -263,6 +311,7 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe if isinstance(model.model_card_data.language, str) else model.model_card_data.language ) + embeddings_dim = model.get_sentence_embedding_dimension() meta = ModelMeta( name=name, revision=model.model_card_data.base_model_revision, @@ -272,7 +321,7 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe similarity_fn_name=model.similarity_fn_name, n_parameters=None, max_tokens=None, - embed_dim=None, + embed_dim=embeddings_dim, license=None, open_weights=True, public_training_code=None, diff --git a/tests/test_benchmark/test_benchmark.py b/tests/test_benchmark/test_benchmark.py index 1393d46f1..37a226f73 100644 --- a/tests/test_benchmark/test_benchmark.py +++ b/tests/test_benchmark/test_benchmark.py @@ -41,7 +41,7 @@ def test_mulitple_mteb_tasks(tasks: list[AbsTask], model: mteb.Encoder, tmp_path: Path): """Test that multiple tasks can be run""" eval = mteb.MTEB(tasks=tasks) - eval.run(model, output_folder=str(tmp_path), overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) # ensure that we can generate a readme from the output folder generate_readme(tmp_path) @@ -56,7 +56,9 @@ def test_mulitple_mteb_tasks(tasks: list[AbsTask], model: mteb.Encoder, tmp_path MockTorchbf16Encoder(), ], ) -def test_benchmark_encoders_on_task(task: str | AbsTask, model: mteb.Encoder): +def test_benchmark_encoders_on_task( + task: str | AbsTask, model: mteb.Encoder, tmp_path: Path +): """Test that a task can be fetched and run using a variety of encoders""" if isinstance(task, str): tasks = mteb.get_tasks(tasks=[task]) @@ -64,7 +66,7 @@ def test_benchmark_encoders_on_task(task: str | AbsTask, model: mteb.Encoder): tasks = [task] eval = mteb.MTEB(tasks=tasks) - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix()) @pytest.mark.parametrize("task", [MockMultilingualRetrievalTask()]) @@ -72,7 +74,9 @@ def test_benchmark_encoders_on_task(task: str | AbsTask, model: mteb.Encoder): "model", [MockSentenceTransformer()], ) -def test_run_eval_without_co2_tracking(task: str | AbsTask, model: mteb.Encoder): +def test_run_eval_without_co2_tracking( + task: str | AbsTask, model: mteb.Encoder, tmp_path: Path +): """Test that a task can be fetched and run without CO2 tracking""" if isinstance(task, str): tasks = 
mteb.get_tasks(tasks=[task]) @@ -80,9 +84,7 @@ def test_run_eval_without_co2_tracking(task: str | AbsTask, model: mteb.Encoder) tasks = [task] eval = mteb.MTEB(tasks=tasks) - eval.run( - model, output_folder="tests/results", overwrite_results=True, co2_tracker=False - ) + eval.run(model, output_folder=tmp_path.as_posix(), co2_tracker=False) @pytest.mark.parametrize("task", MOCK_TASK_TEST_GRID[:1]) @@ -95,20 +97,22 @@ def test_reload_results(task: str | AbsTask, model: mteb.Encoder, tmp_path: Path tasks = [task] eval = mteb.MTEB(tasks=tasks) - results = eval.run(model, output_folder=str(tmp_path), overwrite_results=True) + results = eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) assert isinstance(results, list) assert isinstance(results[0], mteb.TaskResult) # reload the results - results = eval.run(model, output_folder=str(tmp_path), overwrite_results=False) + results = eval.run( + model, output_folder=tmp_path.as_posix(), overwrite_results=False + ) assert isinstance(results, list) assert isinstance(results[0], mteb.TaskResult) @pytest.mark.parametrize("task_name", MOCK_TASK_TEST_GRID) -def test_prompt_name_passed_to_all_encodes(task_name: str | AbsTask): +def test_prompt_name_passed_to_all_encodes(task_name: str | AbsTask, tmp_path: Path): """Test that all tasks correctly pass down the prompt_name to the encoder which supports it, and that the encoder which does not support it does not receive it. """ @@ -141,17 +145,17 @@ def encode(self, sentences, **kwargs): eval.run( model, - output_folder="tests/results", + output_folder=tmp_path.as_posix(), overwrite_results=True, ) # Test that the task_name is not passed down to the encoder model = EncoderWithoutInstructions("average_word_embeddings_levy_dependency") assert model.prompts == {}, "The encoder should not have any prompts" - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) @pytest.mark.parametrize("task_name", MOCK_TASK_TEST_GRID) -def test_encode_kwargs_passed_to_all_encodes(task_name: str | AbsTask): +def test_encode_kwargs_passed_to_all_encodes(task_name: str | AbsTask, tmp_path: Path): """Test that all tasks correctly pass down the encode_kwargs to the encoder.""" my_encode_kwargs = {"no_one_uses_this_args": "but_its_here"} @@ -175,14 +179,14 @@ def encode(self, sentences, task_name: str | None = None, **kwargs): model = MockEncoderWithKwargs() eval.run( model, - output_folder="tests/results", + output_folder=tmp_path.as_posix(), overwrite_results=True, encode_kwargs=my_encode_kwargs, ) @pytest.mark.parametrize("model", [MockNumpyEncoder()]) -def test_run_using_benchmark(model: mteb.Encoder): +def test_run_using_benchmark(model: mteb.Encoder, tmp_path: Path): """Test that a benchmark object can be run using the MTEB class.""" bench = Benchmark( name="test_bench", tasks=mteb.get_tasks(tasks=["STS12", "SummEval"]) @@ -190,12 +194,12 @@ def test_run_using_benchmark(model: mteb.Encoder): eval = mteb.MTEB(tasks=[bench]) eval.run( - model, output_folder="tests/results", overwrite_results=True + model, output_folder=tmp_path.as_posix(), overwrite_results=True ) # we just want to test that it runs @pytest.mark.parametrize("model", [MockNumpyEncoder()]) -def test_run_using_list_of_benchmark(model: mteb.Encoder): +def test_run_using_list_of_benchmark(model: mteb.Encoder, tmp_path: Path): """Test that a list of benchmark objects can be run using the MTEB class.""" bench = [ Benchmark(name="test_bench", 
tasks=mteb.get_tasks(tasks=["STS12", "SummEval"])) @@ -203,7 +207,7 @@ def test_run_using_list_of_benchmark(model: mteb.Encoder): eval = mteb.MTEB(tasks=bench) eval.run( - model, output_folder="tests/results", overwrite_results=True + model, output_folder=tmp_path.as_posix() ) # we just want to test that it runs @@ -229,7 +233,7 @@ def test_get_benchmark(name): @pytest.mark.parametrize("task", MOCK_TASK_TEST_GRID) @pytest.mark.parametrize("is_task_name", [True, False]) def test_prompt_name_passed_to_all_encodes_with_prompts( - task: AbsTask | str, is_task_name: bool + task: AbsTask | str, is_task_name: bool, tmp_path: Path ): """Test that all tasks and task_types correctly pass down the prompt_name to the encoder with prompts.""" _task_name = task.metadata.name if isinstance(task, AbsTask) else task @@ -258,8 +262,7 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): ) eval.run( model, - output_folder="tests/results", - overwrite_results=True, + output_folder=tmp_path.as_posix(), ) class MockEncoderWithExistingPrompts(mteb.Encoder): @@ -275,7 +278,7 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): model = MockSentenceTransformerWrapper(MockEncoderWithExistingPrompts()) eval.run( model, - output_folder="tests/results", + output_folder=tmp_path.as_posix(), overwrite_results=True, ) @@ -292,7 +295,9 @@ def encode(self, sentences, prompt_name: str | None = None, **kwargs): ], ) @pytest.mark.parametrize("is_task_name", [True, False]) -def test_model_query_passage_prompts_task_type(task: AbsTask | str, is_task_name: bool): +def test_model_query_passage_prompts_task_type( + task: AbsTask | str, is_task_name: bool, tmp_path: Path +): """Test that the model with prompts is correctly called.""" tasks = [task] @@ -331,8 +336,7 @@ def encode(self, sentences, prompt_name: str | None = None, *args, **kwargs): eval.run( model, model_prompts=prompt_list, - output_folder="tests/results", - overwrite_results=True, + output_folder=tmp_path.as_posix(), ) model = MockSentenceTransformerWrapper( MockSentenceEncoderWithPrompts(), model_prompts=prompt_list @@ -341,6 +345,5 @@ def encode(self, sentences, prompt_name: str | None = None, *args, **kwargs): eval.run( model, model_prompts=prompt_list, - output_folder="tests/results", - overwrite_results=True, + output_folder=tmp_path.as_posix(), ) diff --git a/tests/test_benchmark/test_benchmark_integration_with_datasets.py b/tests/test_benchmark/test_benchmark_integration_with_datasets.py index 81d4c6b67..8288680c3 100644 --- a/tests/test_benchmark/test_benchmark_integration_with_datasets.py +++ b/tests/test_benchmark/test_benchmark_integration_with_datasets.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +from pathlib import Path import pytest @@ -18,7 +19,7 @@ @pytest.mark.parametrize("task", TASK_TEST_GRID) @pytest.mark.parametrize("model", [MockNumpyEncoder()]) -def test_benchmark_datasets(task: str | AbsTask, model: mteb.Encoder): +def test_benchmark_datasets(task: str | AbsTask, model: mteb.Encoder, tmp_path: Path): """Test that a task can be fetched and run""" eval = MTEB(tasks=[task]) - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) diff --git a/tests/test_benchmark/test_benchmark_integration_with_sentencetransformers.py b/tests/test_benchmark/test_benchmark_integration_with_sentencetransformers.py index 4ca0056cd..e79515be5 100644 --- 
a/tests/test_benchmark/test_benchmark_integration_with_sentencetransformers.py +++ b/tests/test_benchmark/test_benchmark_integration_with_sentencetransformers.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +from pathlib import Path import pytest from sentence_transformers import SentenceTransformer @@ -22,9 +23,11 @@ "average_word_embeddings_levy_dependency", ], ) -def test_benchmark_sentence_transformer(task: str | AbsTask, model_name: str): +def test_benchmark_sentence_transformer( + task: str | AbsTask, model_name: str, tmp_path: Path +): """Test that a task can be fetched and run""" if isinstance(model_name, str): model = SentenceTransformer(model_name) eval = MTEB(tasks=[task]) - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) diff --git a/tests/test_benchmark/test_models.py b/tests/test_benchmark/test_models.py index ee5bed091..5d6cc1a02 100644 --- a/tests/test_benchmark/test_models.py +++ b/tests/test_benchmark/test_models.py @@ -1,5 +1,8 @@ from __future__ import annotations +import sys +from pathlib import Path + import pytest import mteb @@ -9,9 +12,10 @@ from .mock_tasks import MockRetrievalTask +@pytest.mark.skipif(sys.version_info < (3, 10), reason="Requires Python 3.10 or higher") @pytest.mark.parametrize("model", ["colbert-ir/colbertv2.0"]) @pytest.mark.parametrize("task", [MockRetrievalTask()]) -def test_colbert_model_e2e(task: AbsTask, model: str): +def test_colbert_model_e2e(task: AbsTask, model: str, tmp_path: Path): pytest.importorskip("pylate", reason="pylate not installed") eval_splits = ["test"] model = mteb.get_model(model) @@ -21,13 +25,14 @@ def test_colbert_model_e2e(task: AbsTask, model: str): model, eval_splits=eval_splits, corpus_chunk_size=500, + output_folder=tmp_path.as_posix(), ) result = results[0] assert result.scores["test"][0]["ndcg_at_1"] == 1.0 -def test_bm25s_e2e(): +def test_bm25s_e2e(tmp_path: Path): # fails for dataset smaller then 1000 pytest.importorskip("bm25s", reason="bm25s not installed") pytest.importorskip("Stemmer", reason="PyStemmer not installed") @@ -38,7 +43,9 @@ def test_bm25s_e2e(): evaluation = MTEB(tasks=tasks) - results = evaluation.run(model, eval_splits=eval_splits) + results = evaluation.run( + model, eval_splits=eval_splits, output_folder=tmp_path.as_posix() + ) result = results[0] assert result.scores["test"][0]["ndcg_at_1"] == 0.42879 diff --git a/tests/test_cli.py b/tests/test_cli.py index 7c71528f0..fc4a46811 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -50,12 +50,13 @@ def test_run_task( model_name: str, task_name: str, model_revision: str, + tmp_path: Path, ): args = Namespace( model=model_name, tasks=[task_name], model_revision=model_revision, - output_folder="tests/results/test_model", + output_folder=tmp_path.as_posix(), verbosity=3, device=None, categories=None, @@ -71,9 +72,7 @@ def test_run_task( run(args) model_name_as_path = model_name.replace("/", "__").replace(" ", "_") - results_path = Path( - f"tests/results/test_model/{model_name_as_path}/{model_revision}" - ) + results_path = tmp_path / model_name_as_path / model_revision assert results_path.exists(), "Output folder not created" assert "model_meta.json" in [ f.name for f in list(results_path.glob("*.json")) @@ -122,7 +121,7 @@ def test_create_meta(): ), f"Value for {key} does not match" # ensure that the command line interface works as well - command = f"{sys.executable} -m mteb create_meta --results_folder {results} 
--output_path {output_path} --overwrite" + command = f"{sys.executable} -m mteb create_meta --results_folder {results.as_posix()} --output_path {output_path.as_posix()} --overwrite" result = subprocess.run(command, shell=True, capture_output=True, text=True) assert result.returncode == 0, "Command failed" @@ -134,14 +133,16 @@ def test_create_meta(): ("model_card_without_frontmatter.md", "model_card_gold_without_frontmatter.md"), ], ) -def test_create_meta_from_existing(existing_readme_name: str, gold_readme_name: str): +def test_create_meta_from_existing( + existing_readme_name: str, gold_readme_name: str, tmp_path: Path +): """Test create_meta function directly as well as through the command line interface""" test_folder = Path(__file__).parent output_folder = test_folder / "create_meta" results = ( output_folder / "all-MiniLM-L6-v2" / "8b3219a92973c328a8e22fadcfa821b5dc75636a" ) - output_path = output_folder / "model_card.md" + output_path = tmp_path / "model_card.md" existing_readme = output_folder / existing_readme_name args = Namespace( @@ -183,7 +184,7 @@ def test_create_meta_from_existing(existing_readme_name: str, gold_readme_name: ), f"Value for {key} does not match" assert readme_output == gold_readme # ensure that the command line interface works as well - command = f"{sys.executable} -m mteb create_meta --results_folder {results} --output_path {output_path} --from_existing {existing_readme} --overwrite" + command = f"{sys.executable} -m mteb create_meta --results_folder {results.as_posix()} --output_path {output_path.as_posix()} --from_existing {existing_readme.as_posix()} --overwrite" result = subprocess.run(command, shell=True, capture_output=True, text=True) assert result.returncode == 0, "Command failed" diff --git a/tests/test_model_meta/test_model_meta.py b/tests/test_model_meta/test_model_meta.py new file mode 100644 index 000000000..2d23bc66c --- /dev/null +++ b/tests/test_model_meta/test_model_meta.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +import sys +from pathlib import Path + +import pytest +from sentence_transformers import CrossEncoder, SentenceTransformer + +from mteb import MTEB +from mteb.abstasks import AbsTask +from tests.test_benchmark.mock_tasks import MockRetrievalTask + + +def test_create_model_meta_from_sentence_transformers(): + model_name = "sentence-transformers/average_word_embeddings_levy_dependency" + revision = "6d9c09a789ad5dd126b476323fccfeeafcd90509" + model = SentenceTransformer(model_name, revision=revision) + + meta = MTEB.create_model_meta(model) + + assert meta.similarity_fn_name == "cosine" + assert meta.embed_dim == model.get_sentence_embedding_dimension() + assert type(meta.framework) is list + assert meta.framework[0] == "Sentence Transformers" + assert meta.name == model_name + assert meta.revision == revision + + +def test_create_model_meta_from_cross_encoder(): + model_name = "cross-encoder/ms-marco-TinyBERT-L-2-v2" + revision = "841d331b6f34b15d6ac0ab366ae3a3b36eeac691" + model = CrossEncoder(model_name, revision=revision) + + meta = MTEB.create_model_meta(model) + + assert meta.name == model_name + assert meta.revision == revision + + return meta + + +@pytest.mark.parametrize("task", [MockRetrievalTask()]) +def test_output_folder_model_meta(task: AbsTask, tmp_path: Path): + mteb = MTEB(tasks=[task]) + model_name = "cross-encoder/ms-marco-TinyBERT-L-2-v2" + model = CrossEncoder(model_name) + meta = mteb.create_model_meta(model) + output_path = mteb.create_output_folder( + model_meta=meta, 
output_folder=tmp_path.as_posix() + ) + + output_path = Path(output_path) + assert output_path.exists() + assert output_path.is_dir() + assert output_path.name == model.config._commit_hash + assert output_path.parent.name == "cross-encoder__ms-marco-TinyBERT-L-2-v2" + assert output_path.parent.parent == tmp_path + + +@pytest.mark.skipif(sys.version_info < (3, 10), reason="Requires Python 3.10 or higher") +def test_model_meta_colbert(): + model_name = "colbert-ir/colbertv2.0" + colbert_model = pytest.importorskip("pylate.models", reason="pylate not installed") + revision = "c1e84128e85ef755c096a95bdb06b47793b13acf" + model = colbert_model.ColBERT(model_name, revision=revision) + + meta = MTEB.create_model_meta(model) + + # assert meta.similarity_fn_name == "MaxSim" test with new release of pylate + assert type(meta.framework) is list + assert meta.framework[0] == "Sentence Transformers" + assert meta.name == model_name + assert meta.revision == revision diff --git a/tests/test_reproducible_workflow.py b/tests/test_reproducible_workflow.py index 1c7536076..1973072ba 100644 --- a/tests/test_reproducible_workflow.py +++ b/tests/test_reproducible_workflow.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +from pathlib import Path import pytest @@ -18,7 +19,9 @@ @pytest.mark.parametrize("task_name", ["BornholmBitextMining"]) @pytest.mark.parametrize("model_name", ["sentence-transformers/all-MiniLM-L6-v2"]) @pytest.mark.parametrize("model_revision", ["8b3219a92973c328a8e22fadcfa821b5dc75636a"]) -def test_reproducibility_workflow(task_name: str, model_name: str, model_revision: str): +def test_reproducibility_workflow( + task_name: str, model_name: str, model_revision: str, tmp_path: Path +): """Test that a model and a task can be fetched and run in a reproducible fashion.""" model_meta = mteb.get_model_meta(model_name, revision=model_revision) task = mteb.get_task(task_name) @@ -30,7 +33,7 @@ def test_reproducibility_workflow(task_name: str, model_name: str, model_revisio assert isinstance(model, Encoder) eval = MTEB(tasks=[task]) - eval.run(model, output_folder="tests/results", overwrite_results=True) + eval.run(model, output_folder=tmp_path.as_posix(), overwrite_results=True) @pytest.mark.parametrize( diff --git a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py index effd76829..4a535bebb 100644 --- a/tests/test_tasks/test_mteb_rerank.py +++ b/tests/test_tasks/test_mteb_rerank.py @@ -339,17 +339,16 @@ def test_mteb_rerank(tmp_path: Path): eval.run( model, # type: ignore - output_folder="tests/results", + output_folder=tmp_path.as_posix(), overwrite_results=True, eval_splits=["test"], top_k=2, previous_results=tmp_file, save_predictions=True, ) - tmp_file.unlink() # read in the results - with open("tests/results/SciFact_default_predictions.json") as f: + with (tmp_path / "SciFact_default_predictions.json").open() as f: results = json.load(f) # check that only the top two results are re-orderd @@ -358,7 +357,7 @@ def test_mteb_rerank(tmp_path: Path): assert "18670" in results["1"] -def test_reranker_same_ndcg1(): +def test_reranker_same_ndcg1(tmp_path: Path): de_name = "average_word_embeddings_komninos" revision = "21eec43590414cb8e3a6f654857abed0483ae36e" de = SentenceTransformer(de_name, revision=revision) @@ -382,32 +381,35 @@ def test_reranker_same_ndcg1(): framework=["Sentence Transformers", "PyTorch"], ) eval = MTEB(tasks=mteb.get_tasks(["SciFact"])) + stage1_path = tmp_path / "stage1" eval.run( de, - output_folder="tests/results/stage1", + 
output_folder=stage1_path.as_posix(), overwrite_results=True, save_predictions=True, eval_splits=["test"], ) + stage2_path = tmp_path / "stage2" eval.run( ce, # type: ignore - output_folder="tests/results/stage2", + output_folder=stage2_path.as_posix(), overwrite_results=True, - previous_results="tests/results/stage1/SciFact_default_predictions.json", + previous_results=(stage1_path / "SciFact_default_predictions.json"), save_predictions=False, eval_splits=["test"], top_k=1, # don't allow it to rerank more than 1 so we can check for top_1 being the same ) # read in stage 1 and stage two and check ndcg@1 is the same - with open( - f"tests/results/stage1/sentence-transformers__{de_name}/{revision}/SciFact.json" - ) as f: + with ( + stage1_path / f"sentence-transformers__{de_name}/{revision}/SciFact.json" + ).open() as f: stage1 = json.load(f) - with open( - f"tests/results/stage2/cross-encoder__ms-marco-TinyBERT-L-2-v2/{ce_revision}/SciFact.json" - ) as f: + with ( + stage2_path + / f"cross-encoder__ms-marco-TinyBERT-L-2-v2/{ce_revision}/SciFact.json" + ).open() as f: stage2 = json.load(f) assert ( From fe330611b6e433096501d0d9814b2c644c33e984 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Wed, 22 Jan 2025 07:53:26 +0100 Subject: [PATCH 39/49] fix: Fixed leaderboard search bar (#1852) Fixed leaderboard search bar --- mteb/leaderboard/app.py | 4 ++-- mteb/leaderboard/table.py | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index f339c6142..5ee5a6b9d 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -263,7 +263,7 @@ def filter_models( with gr.Row(): searchbar = gr.Textbox( label="Search Models", - info="Search models by name (RegEx sensitive. Separate queries with `|`)", + info="Press Enter to search.\nSearch models by name (RegEx sensitive. 
Separate queries with `|`)", interactive=True, ) compatibility = gr.CheckboxGroup( @@ -626,7 +626,7 @@ def update_tables( inputs=[scores, searchbar, task_select, models], outputs=[summary_table, per_task_table], ) - searchbar.input( + searchbar.submit( update_tables, inputs=[scores, searchbar, task_select, models], outputs=[summary_table, per_task_table], diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index 2cb5fb34b..ef28392cf 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -142,6 +142,11 @@ def scores_to_tables( names = per_task.index.get_level_values("model_name") names = pd.Series(names, index=per_task.index) to_remove |= ~names.str.contains(search_query, regex=True) + if to_remove.all(): + no_results_frame = pd.DataFrame( + {"No results": ["You can try relaxing your criteria"]} + ) + return gr.DataFrame(no_results_frame), gr.DataFrame(no_results_frame) models_to_remove = list(per_task[to_remove].index) typed_mean = mean_per_type.mean(skipna=False, axis=1) overall_mean = per_task.mean(skipna=False, axis=1) From 2f8cfae0f1e004fec9ddbf6857892f27e92612ee Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 22 Jan 2025 07:12:26 +0000 Subject: [PATCH 40/49] 1.29.13 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 15fcb6978..c484b8d06 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.12" +version = "1.29.13" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 4bd7328f1d43ff36564eb5941e7b32daf826f456 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Wed, 22 Jan 2025 10:26:36 +0100 Subject: [PATCH 41/49] fix: Hotfixed public_training_data type annotation (#1857) Fixed public_training_data flag type to include boolean, as this is how all models are annotated --- mteb/model_meta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/model_meta.py b/mteb/model_meta.py index c88326edc..48c8b1295 100644 --- a/mteb/model_meta.py +++ b/mteb/model_meta.py @@ -95,7 +95,7 @@ class ModelMeta(BaseModel): license: str | None open_weights: bool | None public_training_code: str | None - public_training_data: str | None + public_training_data: str | bool | None framework: list[FRAMEWORKS] reference: STR_URL | None = None similarity_fn_name: DISTANCE_METRICS | None From 4985da94cbc4c1368debab737fa8195f6bb91ce2 Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Wed, 22 Jan 2025 12:32:42 +0300 Subject: [PATCH 42/49] fix: Fix zeta alpha mistral (#1736) * fix zeta alpha mistral * update use_instructions * update training datasets * Update mteb/models/e5_instruct.py Co-authored-by: Kenneth Enevoldsen * update float * Update mteb/models/e5_instruct.py --------- Co-authored-by: Kenneth Enevoldsen --- mteb/models/e5_instruct.py | 75 ++++++++++++++++++++++++++++++++++++++ mteb/models/misc_models.py | 22 +---------- 2 files changed, 76 insertions(+), 21 deletions(-) diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py index c89b64fc7..3eed189d3 100644 --- a/mteb/models/e5_instruct.py +++ b/mteb/models/e5_instruct.py @@ -84,3 +84,78 @@ public_training_data=None, training_datasets=E5_TRAINING_DATA, ) + +zeta_alpha_ai__Zeta_Alpha_E5_Mistral = ModelMeta( + loader=partial( # type: ignore + instruct_wrapper, + model_name_or_path="zeta-alpha-ai/Zeta-Alpha-E5-Mistral", + 
instruction_template=E5_INSTRUCTION, + attn="cccc", + pooling_method="lasttoken", + mode="embedding", + torch_dtype=torch.bfloat16, + # The ST script does not normalize while the HF one does so unclear what to do + # https://huggingface.co/intfloat/e5-mistral-7b-instruct#transformers + normalized=True, + ), + name="zeta-alpha-ai/Zeta-Alpha-E5-Mistral", + revision="c791d37474fa6a5c72eb3a2522be346bc21fbfc3", + release_date="2024-08-30", + languages=["eng_Latn"], + n_parameters=7110660096, + max_tokens=32768.0, + embed_dim=4096, + license="mit", + open_weights=True, + public_training_data=None, + public_training_code=None, + framework=["PyTorch"], + reference="https://huggingface.co/zeta-alpha-ai/Zeta-Alpha-E5-Mistral", + similarity_fn_name="cosine", + use_instructions=True, + training_datasets={ + # copied from e5 + # source: https://arxiv.org/pdf/2212.03533 + "NQ": ["test"], + "NQHardNegatives": ["test"], + "MSMARCO": ["train"], # dev? + # source: https://www.zeta-alpha.com/post/fine-tuning-an-llm-for-state-of-the-art-retrieval-zeta-alpha-s-top-10-submission-to-the-the-mteb-be + # "Arguana", + # "FEVER", + # "FIQA", + # "HotPotQA", + # "MsMarco (passage)", + # "NFCorpus", + # "SciFact", + # "NLI", + # "SQuad", + # "StackExchange", + # "TriviaQA", + # "SciRep", + # "SciRepEval" + # mteb + # https://huggingface.co/datasets/mteb/raw_arxiv + # "ArxivClusteringS2S": ["train"], + # "ArxivClusteringP2P": ["train"], + # https://huggingface.co/datasets/mteb/raw_biorxiv + # "BiorxivClusteringS2S": ["train"], + # "BiorxivClusteringP2P": ["train"], + # https://huggingface.co/datasets/mteb/raw_medrxiv + # "MedrxivClusteringS2S": ["train"], + # "MedrxivClusteringP2P": ["train"], + # as their train datasets + "AmazonCounterfactualClassification": ["train"], + "AmazonReviewsClassification": ["train"], + "Banking77Classification": ["train"], + "EmotionClassification": ["train"], + "MTOPIntentClassification": ["train"], + "ToxicConversationsClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "ImdbClassification": ["train"], + "STS12": ["train"], + "STS22": ["train"], + "STSBenchmark": ["train"], + }, + adapted_from="intfloat/e5-mistral-7b-instruct", + superseded_by=None, +) diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index ba6e3e816..bf41d3cdb 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -1607,27 +1607,7 @@ adapted_from="/workspace/v3-matryoshka_aubmindlab-bert-base-arabertv02-2024-10-12_13-55-06/checkpoint-26250", superseded_by=None, ) -zeta_alpha_ai__Zeta_Alpha_E5_Mistral = ModelMeta( - name="zeta-alpha-ai/Zeta-Alpha-E5-Mistral", - revision="3e6076bdc2ff592a2f95fbc04570e51db5aa0c0c", - release_date="2024-08-30", - languages=["eng_Latn"], - loader=None, - n_parameters=7110660096, - max_tokens=32768.0, - embed_dim=4096, - license="mit", - open_weights=True, - public_training_code=None, - public_training_data=None, - framework=["PyTorch"], - reference="https://huggingface.co/zeta-alpha-ai/Zeta-Alpha-E5-Mistral", - similarity_fn_name="cosine", - use_instructions=None, - training_datasets=None, - adapted_from="intfloat/e5-mistral-7b-instruct", - superseded_by=None, -) + sbert_chinese_general_v1 = ModelMeta( name="DMetaSoul/sbert-chinese-general-v1", revision="bd27765956bcc2fcf682de0097819947ac10037e", From 12ed9c50debd83b7fd6f589373d1fd4539f2aa17 Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Wed, 22 Jan 2025 12:33:28 +0300 Subject: [PATCH 43/49] Add more annotations (#1833) * apply additions from #1794 * add 
annotations for rumodels * add nomic training data * fix metadata * update rest of model meta * fix bge reranker --- mteb/models/bge_models.py | 115 ++++++------------------ mteb/models/colbert_models.py | 10 ++- mteb/models/ibm_granite_models.py | 67 +++++++++++++- mteb/models/jina_models.py | 18 +++- mteb/models/misc_models.py | 4 +- mteb/models/nomic_models.py | 96 +++++++++++++++++--- mteb/models/rerankers_custom.py | 3 +- mteb/models/ru_sentence_models.py | 144 ++++++++++++++++++++++++------ mteb/models/stella_models.py | 2 + 9 files changed, 322 insertions(+), 137 deletions(-) diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index d9eb64246..001c711ed 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -7,8 +7,8 @@ model_prompts = {"query": "Represent this sentence for searching relevant passages: "} model_prompts_zh = {"query": "为这个句子生成表示以用于检索相关文章:"} -bge_m_training_data = { - # source: https://arxiv.org/pdf/2402.03216 +bge_m3_training_data = { + # source: https://arxiv.org/abs/2402.03216 "MIRACLRetrieval": ["train"], "MIRACLRetrievalHardNegatives": ["train"], "MIRACLReranking": ["train"], @@ -28,6 +28,28 @@ "HotpotQA": ["train"], "HotpotQA-PL": ["train"], # translation not trained on "HotpotQAHardNegatives": ["train"], + "T2Retrieval": ["train"], + "DuReader": ["train"], + "MMarcoReranking": ["train"], + "CodeSearchNet": ["train"], + # not in mteb + # "s2orc" + # Wikipedia + # "xP3" + # "mC4" + # "CC-News" + # "MTP" + # "NLLB" + # "CCMatrix" + # TriviaQA + # COL-IEE + # PubMedQA + # SQuAD + # SimCSE + # mMARCO-ZH + # LawGPT + # NLI-zh2, LeCaRDv2, + # NLI, MultiLongDoc (their syntetic) # + synthetic data } @@ -89,38 +111,6 @@ # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) } -bgem3_training_data = { - # source https://arxiv.org/abs/2402.03216 - "T2Retrieval": ["train"], - "DuReader": ["train"], - "MMarcoReranking": ["train"], - "CMedQAv2-reranking": ["train"], - "HotpotQA": ["train"], - "NQ": ["train"], - "MSMARCO": ["train"], - "MrTidyRetrieval": ["train"], - "MIRACLRetrieval": ["train"], - "CodeSearchNet": ["train"], - # not in mteb - # "s2orc" - # Wikipedia - # "xP3" - # "mC4" - # "CC-News" - # "MTP" - # "NLLB" - # "CCMatrix" - # TriviaQA - # COL-IEE - # PubMedQA - # SQuAD - # SimCSE - # mMARCO-ZH - # LawGPT - # NLI-zh2, LeCaRDv2, - # NLI, MultiLongDoc (their syntetic) -} - # https://huggingface.co/BAAI/bge-m3/discussions/29 bgem3_languages = [ "afr_Latn", # af @@ -298,59 +288,6 @@ "zho_Hans", # zh ] -bge_m_training_data = { - # source: https://arxiv.org/pdf/2402.03216 - "MIRACLRetrieval": ["train"], - "MIRACLRetrievalHardNegatives": ["train"], - "MIRACLReranking": ["train"], - "LeCaRDv2": ["train"], - "CMedQAv1-reranking": ["train"], - "CMedQAv2-reranking": ["train"], - "MrTidyRetrieval": ["train"], - "T2Reranking": ["train"], - "MSMARCO": ["train"], - "MSMARCOHardNegatives": ["train"], - "NanoMSMARCORetrieval": ["train"], - "MSMARCO-PL": ["train"], # translation not trained on - "NQ": ["train"], - "NQHardNegatives": ["train"], - "NanoNQRetrieval": ["train"], - "NQ-PL": ["train"], # translation not trained on - "HotpotQA": ["train"], - "HotpotQA-PL": ["train"], # translation not trained on - "HotpotQAHardNegatives": ["train"], - # + synthetic data -} - -bge_training_data = { - # source: https://data.baai.ac.cn/details/BAAI-MTP - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], # assumed from: amazon_reviews_multi - "MLQARetrieval": [ - 
"validation", - "test", - ], # assumed from mlqa (question, context) - # not in mteb - # Dataset Pairs - # wudao (title, passage) - # cmrc2018 (query, context) - # dureader (query, context) - # simclue (sentence_a, sentence_b) - # csl (title, abstract) - # amazon_reviews_multi (title, body) - # wiki_atomic_edits (base_sentence, edited_sentence) - # mlqa (question, context) - # xlsum (title, summary) (title, text) - # "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further - # "wikipedia": [], # title + section title, passage - # "reddit": [], # title, body - # "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer) - # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) -} bge_small_en_v1_5 = ModelMeta( loader=partial( # type: ignore @@ -522,8 +459,8 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, - public_training_data=None, - training_datasets=bgem3_training_data, + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", + training_datasets=bge_m3_training_data, ) diff --git a/mteb/models/colbert_models.py b/mteb/models/colbert_models.py index 89b09de28..8bb66948a 100644 --- a/mteb/models/colbert_models.py +++ b/mteb/models/colbert_models.py @@ -165,7 +165,9 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: use_instructions=False, adapted_from=None, superseded_by=None, - training_datasets=None, + training_datasets={ + "MSMARCO": ["train"], # dev? + }, ) @@ -218,5 +220,9 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: use_instructions=False, adapted_from=None, superseded_by=None, - training_datasets=None, + training_datasets={ + "MSMARCO": ["train"], + "DuRetrieval": [], + "MIRACL": ["train"], + }, ) diff --git a/mteb/models/ibm_granite_models.py b/mteb/models/ibm_granite_models.py index 63679879c..e7c3b8b02 100644 --- a/mteb/models/ibm_granite_models.py +++ b/mteb/models/ibm_granite_models.py @@ -20,6 +20,65 @@ "zho_Hans", ] +granite_training_data = { + # Multilingual MC4 + # Multilingual Webhose + # English Wikipedia + # Multilingual Wikimedia + "WikipediaRetrievalMultilingual": [], + "WikipediaRerankingMultilingual": [], + # Miracl Corpus (Title-Body) + # Stack Exchange Duplicate questions (titles) + # Stack Exchange Duplicate questions (titles) + # Stack Exchange Duplicate questions (bodies) + "StackOverflowDupQuestions": [], + "AskUbuntuDupQuestions": [], + # Stack Exchange (Title, Answer) pairs + # Stack Exchange (Title, Body) pairs + # Stack Exchange (Title, Body) pairs + # Machine Translations of Stack Exchange Duplicate questions (titles) + # Machine Translations of Stack Exchange (Title+Body, Answer) pairs + "StackExchangeClusteringP2P": [], + "StackExchangeClusteringP2P.v2": [], + "StackExchangeClustering": [], + "StackExchangeClustering.v2": [], + # SearchQA + # S2ORC (Title, Abstract) + # WikiAnswers Duplicate question pairs + # CCNews + # XSum + # SimpleWiki + # Machine Translated Cross Lingual Parallel Corpora + # SPECTER citation triplets + # Machine Translations of SPECTER citation triplets + # Natural Questions (NQ) + "NQ": ["test"], + "NQHardNegatives": ["test"], + # SQuAD2.0 + # HotpotQA + "HotPotQA": ["test"], + "HotPotQAHardNegatives": ["test"], + "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) + # Fever + "FEVER": ["test"], + "FEVERHardNegatives": ["test"], + # PubMed + # Multilingual Miracl 
Triples + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + # Multilingual MrTydi Triples + "MrTidyRetrieval": ["train"], + # Sadeeem Question Asnwering + # DBPedia Title-Body Pairs + "DBPedia": ["train"], + # Synthetic: English Query-Wikipedia Passage + # Synthetic: English Fact Verification + # Synthetic: Multilingual Query-Wikipedia Passage + # Synthetic: Multilingual News Summaries + # IBM Internal Triples + # IBM Internal Title-Body Pairs +} granite_107m_multilingual = ModelMeta( loader=partial( # type: ignore @@ -44,7 +103,7 @@ public_training_code=None, public_training_data=None, use_instructions=False, - training_datasets=None, + training_datasets=granite_training_data, ) granite_278m_multilingual = ModelMeta( @@ -70,7 +129,7 @@ public_training_code=None, public_training_data=None, use_instructions=False, - training_datasets=None, + training_datasets=granite_training_data, ) granite_30m_english = ModelMeta( @@ -96,7 +155,7 @@ public_training_code=None, public_training_data=None, use_instructions=False, - training_datasets=None, + training_datasets=granite_training_data, ) granite_125m_english = ModelMeta( @@ -122,5 +181,5 @@ public_training_code=None, public_training_data=None, use_instructions=False, - training_datasets=None, + training_datasets=granite_training_data, ) diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py index 4f1b58a35..e855ad3c7 100644 --- a/mteb/models/jina_models.py +++ b/mteb/models/jina_models.py @@ -222,9 +222,25 @@ def encode( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, reference="https://huggingface.co/jinaai/jina-embeddings-v3", - training_datasets=None, public_training_code=None, public_training_data=None, + training_datasets={ + # CulturaX + "STS12": [], + # "SICK": [], + # "WMT19": [], + # "MADLAD-3B": [], + # NLI + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # oasst1, oasst2 + }, + adapted_from="XLM-RoBERTa", ) diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index bf41d3cdb..ba12b0bb5 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -7,7 +7,7 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader from mteb.models.e5_models import E5_TRAINING_DATA -from .bge_models import bge_m_training_data, bge_training_data +from .bge_models import bge_m3_training_data, bge_training_data from .sentence_transformers_models import sent_trf_training_dataset Haon_Chen__speed_embedding_7b_instruct = ModelMeta( @@ -1445,7 +1445,7 @@ reference="https://huggingface.co/deepvk/USER-bge-m3", similarity_fn_name="cosine", use_instructions=None, - training_datasets=bge_m_training_data, # derived from. + training_datasets=bge_m3_training_data, # derived from. 
# not in MTEB: # "deepvk/ru-HNP": ["train"], # "deepvk/ru-WANLI": ["train"], diff --git a/mteb/models/nomic_models.py b/mteb/models/nomic_models.py index 772d92902..15c7df123 100644 --- a/mteb/models/nomic_models.py +++ b/mteb/models/nomic_models.py @@ -90,6 +90,79 @@ def encode( # type: ignore return emb +nomic_training_data = { + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/data/contrastive_pretrain.yaml + # reddit_title_body + "RedditClustering": [], + "RedditClusteringP2P": [], + "RedditClustering.v2": [], + "RedditClusteringP2P.v2": [], + # amazon_reviews + # amazonqa + "AmazonPolarityClassification": [], + "AmazonReviewsClassification": [], + "AmazonCounterfactualClassification": [], + # paq + # s2orc_citation_titles + # s2orc_title_abstract + # s2orc_abstract_citation + # s2orc_abstract_body + # wikianswers + # wikipedia + "WikipediaRetrievalMultilingual": [], + "WikipediaRerankingMultilingual": [], + # gooaq + # codesearch + "CodeSearchNetCCRetrieval": [], + "COIRCodeSearchNetRetrieval": [], + # yahoo_title_answer + # yahoo_qa + # yahoo_title_question + "YahooAnswersTopicsClassification": [], + # agnews + # ccnews + # npr + # eli5 + # cnn + # stackexchange_duplicate_questions + # stackexchange_title_body + # stackexchange_body_body + "StackExchangeClustering.v2": [], + "StackExchangeClusteringP2P.v2": [], + # sentence_compression + # wikihow + # altlex + # quora + "QuoraRetrieval": [], + "NanoQuoraRetrieval": [], + # simplewiki + # squad + "FQuADRetrieval": [], + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/data/finetune_triplets.yaml + # msmaro + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + # nq_triples + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # nli_triplets + # reddit + # medi_wiki + # medi_stackexchange + # medi_flickr + # medi_supernli + # hotpot + "HotPotQA": ["test"], + "HotPotQAHardNegatives": ["test"], + "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) + # fever + "FEVER": ["test"], + "FEVERHardNegatives": ["test"], +} + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/eval/mteb_eval/eval_mteb.py#L142-L159 model_prompts = { "Classification": "classification: ", @@ -126,9 +199,9 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, - public_training_code=None, public_training_data=None, - training_datasets=None, + public_training_code="https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune.yaml", + training_datasets=nomic_training_data, ) nomic_embed_v1 = ModelMeta( @@ -154,9 +227,9 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by="nomic-ai/nomic-embed-text-v1.5", - public_training_code=None, + public_training_code="https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune.yaml", + training_datasets=nomic_training_data, public_training_data=None, - training_datasets=None, ) nomic_embed_v1_ablated = ModelMeta( @@ -182,9 +255,9 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, - public_training_code=None, + 
public_training_code="https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune.yaml", + training_datasets=nomic_training_data, public_training_data=None, - training_datasets=None, ) @@ -211,9 +284,9 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, - public_training_code=None, + public_training_code="https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune.yaml", + training_datasets=nomic_training_data, public_training_data=None, - training_datasets=None, ) nomic_modern_bert_embed = ModelMeta( @@ -239,9 +312,10 @@ def encode( # type: ignore similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - adapted_from=None, + adapted_from="answerdotai/ModernBERT-base", + public_training_code="https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_pretrain_modernbert.yaml", + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune_modernnomic.yaml superseded_by=None, - public_training_code=None, + training_datasets=nomic_training_data, public_training_data=None, - training_datasets=None, ) diff --git a/mteb/models/rerankers_custom.py b/mteb/models/rerankers_custom.py index 1a0fd1f6b..0e2c8d8f7 100644 --- a/mteb/models/rerankers_custom.py +++ b/mteb/models/rerankers_custom.py @@ -11,6 +11,7 @@ from mteb.encoder_interface import Encoder from mteb.evaluation.evaluators.RetrievalEvaluator import DenseRetrievalExactSearch from mteb.model_meta import ModelMeta +from mteb.models.bge_models import bge_m3_training_data logger = logging.getLogger(__name__) @@ -294,6 +295,6 @@ def loader_inner(**kwargs: Any) -> Encoder: public_training_data=None, similarity_fn_name=None, use_instructions=None, - training_datasets=None, + training_datasets=bge_m3_training_data, framework=["Sentence Transformers", "PyTorch"], ) diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index 297c7f314..a91b6e728 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -6,44 +6,53 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader -from .bge_models import bge_training_data +from .bge_models import bge_m3_training_data -rubert_tiny2 = ModelMeta( - name="cointegrated/rubert-tiny2", +rubert_tiny = ModelMeta( + name="cointegrated/rubert-tiny", languages=["rus_Cyrl"], open_weights=True, - revision="dad72b8f77c5eef6995dd3e4691b758ba56b90c3", - release_date="2021-10-28", + revision="5441c5ea8026d4f6d7505ec004845409f1259fb1", + release_date="2021-05-24", n_parameters=29_400_000, embed_dim=312, license="mit", max_tokens=2048, - reference="https://huggingface.co/cointegrated/rubert-tiny2", + reference="https://huggingface.co/cointegrated/rubert-tiny", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - public_training_code=None, + public_training_code="https://gist.github.com/avidale/7bc6350f26196918bf339c01261f5c60", + training_datasets={ + # [Yandex Translate corpus](https://translate.yandex.ru/corpus), [OPUS-100](https://huggingface.co/datasets/opus100) + "Tatoeba": ["train"], + }, + adapted_from="google-bert/bert-base-multilingual-cased", public_training_data=None, - training_datasets=None, ) -rubert_tiny = 
ModelMeta( - name="cointegrated/rubert-tiny", +rubert_tiny2 = ModelMeta( + name="cointegrated/rubert-tiny2", languages=["rus_Cyrl"], open_weights=True, - revision="5441c5ea8026d4f6d7505ec004845409f1259fb1", - release_date="2021-05-24", + revision="dad72b8f77c5eef6995dd3e4691b758ba56b90c3", + release_date="2021-10-28", n_parameters=29_400_000, embed_dim=312, license="mit", max_tokens=2048, - reference="https://huggingface.co/cointegrated/rubert-tiny", + reference="https://huggingface.co/cointegrated/rubert-tiny2", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - public_training_code=None, + public_training_code="https://colab.research.google.com/drive/1mSWfIQ6PIlteLVZ9DKKpcorycgLIKZLf?usp=sharing", + training_datasets={ + # https://huggingface.co/datasets/cointegrated/ru-paraphrase-NMT-Leipzig + # Wikipedia https://huggingface.co/datasets/Madjogger/JamSpell_dataset + # https://huggingface.co/datasets/imvladikon/leipzig_corpora_collection + }, + adapted_from="cointegrated/rubert-tiny", public_training_data=None, - training_datasets=None, ) sbert_large_nlu_ru = ModelMeta( @@ -81,7 +90,10 @@ use_instructions=False, public_training_code=None, public_training_data=None, - training_datasets=None, + training_datasets={ + # SNLI, MNLI + # https://github.com/brmson/dataset-sts + }, ) user_base_ru = ModelMeta( @@ -97,20 +109,76 @@ revision="436a489a2087d61aa670b3496a9915f84e46c861", release_date="2024-06-10", n_parameters=427_000_000, - embed_dim=1024, - license="Not specified", - max_tokens=512, # best guess - reference="https://huggingface.co/ai-forever/sbert_large_mt_nlu_ru", + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/deepvk/USER-base", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], + adapted_from="https://huggingface.co/deepvk/deberta-v1-base", use_instructions=True, + training_datasets={ + "BibleNLPBitextMining": ["train"], + # https://github.com/unicamp-dl/mMARCO + # deepvk/ru-HNP + # deepvk/ru-WANLI + # MedNLI + # RCB + "TERRa": ["train"], + # Tapaco + # Opus100 + # BiblePar + # RudetoxifierDataDetox + # RuParadetox + "MIRACL": ["train"], + # MLDR + # Lenta + "MLSUMClusteringP2P": ["train"], + "MLSUMClusteringP2P.v2": ["train"], + "MLSUMClusteringS2S": ["train"], + "MLSUMClusteringS2S.v2": ["train"], + "MrTidyRetrieval": ["train"], + # "Panorama" + # PravoIsrael + # xlsum + # Fialka-v1 + # RussianKeywords + # Gazeta + # Gsm8k-ru + # DSumRu + # SummDialogNews + }, + public_training_code=None, + public_training_data=None, +) + +user_bge_m3 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="deepvk/USER-bge-m3", + revision="0cc6cfe48e260fb0474c753087a69369e88709ae", + ), + name="deepvk/USER-bge-m3", + languages=["rus_Cyrl"], + open_weights=True, + revision="0cc6cfe48e260fb0474c753087a69369e88709ae", + release_date="2024-07-05", + n_parameters=359_026_688, + embed_dim=1024, + license="apache-2.0", + max_tokens=8194, + reference="https://huggingface.co/deepvk/USER-base", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + adapted_from="https://huggingface.co/BAAI/bge-m3", + use_instructions=False, training_datasets={ "BibleNLPBitextMining": ["train"], "MLSUMClusteringP2P": ["train"], "MLSUMClusteringP2P.v2": ["train"], "MLSUMClusteringS2S": ["train"], "MLSUMClusteringS2S.v2": ["train"], - **bge_training_data, + **bge_m3_training_data, # not MTEB: # "deepvk/ru-HNP": ["train"], # 
"deepvk/ru-WANLI": ["train"], @@ -145,6 +213,7 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + # Wikipedia, Books, Twitter comments, Pikabu, Proza.ru, Film subtitles, News websites, and Social corpus public_training_code=None, public_training_data=None, training_datasets=None, @@ -159,7 +228,7 @@ n_parameters=1280_000_000, embed_dim=768, license="Not specified", - max_tokens=512, # best guess + max_tokens=512, reference="https://huggingface.co/DeepPavlov/rubert-base-cased", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], @@ -204,7 +273,10 @@ use_instructions=False, public_training_code=None, public_training_data=None, - training_datasets=None, + training_datasets={ + # "SNLI": [], + "XNLI": ["dev"] + }, ) labse_en_ru = ModelMeta( @@ -221,9 +293,10 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - public_training_code=None, + public_training_code="https://colab.research.google.com/drive/1dnPRn0-ugj3vZgSpyCC9sgslM2SuSfHy?usp=sharing", public_training_data=None, training_datasets=None, + adapted_from="sentence-transformers/LaBSE", ) rubert_tiny_turbo = ModelMeta( @@ -244,6 +317,7 @@ public_training_data=None, training_datasets=None, # source model in unknown # Not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, + adapted_from="cointegrated/rubert-tiny2", ) labse_ru_turbo = ModelMeta( @@ -260,9 +334,10 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - training_datasets=None, # source model in unknown + training_datasets=None, # not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, public_training_code=None, + adapted_from="cointegrated/LaBSE-en-ru", public_training_data=None, ) @@ -290,8 +365,23 @@ embed_dim=1024, license="mit", similarity_fn_name="cosine", - public_training_code=None, + adapted_from="ai-forever/ruRoberta-large", + training_datasets={ + # https://huggingface.co/ai-forever/ruRoberta-large + # https://huggingface.co/datasets/IlyaGusev/yandex_q_full + # https://huggingface.co/datasets/IlyaGusev/pikabu + # https://huggingface.co/datasets/IlyaGusev/ru_stackoverflow + # https://huggingface.co/datasets/IlyaGusev/habr + # https://huggingface.co/datasets/its5Q/habr_qna + # NewsCommentary + # MultiParaCrawl + "XNLI": [], + "XNLIV2": [], + "LanguageClassification": [], # XNLI + "MIRACLReranking": ["train"], + "MIRACLRetrieval": ["train"], + }, public_training_data=None, - training_datasets=None, + public_training_code=None, framework=["Sentence Transformers", "PyTorch"], ) diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index 7210b287c..92d5db7c8 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -29,6 +29,7 @@ framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_400M_v5", training_datasets=None, + # will be at https://github.com/NLPJCL/RAG-Retrieval public_training_code=None, public_training_data=None, ) @@ -55,6 +56,7 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_1.5B_v5", + # will be at https://github.com/NLPJCL/RAG-Retrieval training_datasets=None, public_training_code=None, public_training_data=None, From fde446d06ca1b0a779bb771dd578b43eb41c7c87 Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 22 Jan 2025 09:41:05 +0000 Subject: 
[PATCH 44/49] 1.29.14 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c484b8d06..18b551f19 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.13" +version = "1.29.14" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 692bd265e731c934d8318c497b954e271540a6ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Wed, 22 Jan 2025 12:34:01 +0100 Subject: [PATCH 45/49] fix: Adding missing model meta (#1856) * Added CDE models * Added bge-en-icl * Updated CDE to bge_full_data * Fixed public_training_data flag type to include boolean, as this is how all models are annotated * Added public training data link instead of bool to CDE and BGE * Added GME models * Changed Torch to PyTorch * Added metadata on LENS models * Added ember_v1 * Added metadata for amazon titan * Removed GME implementation --- mteb/models/bge_models.py | 82 ++++++++++++++++++++++++++++++++++++++ mteb/models/cde_models.py | 54 +++++++++++++++++++++++++ mteb/models/gme_models.py | 63 +++++++++++++++++++++++++++++ mteb/models/lens_models.py | 49 +++++++++++++++++++++++ mteb/models/misc_models.py | 38 ++++++++++++++++++ mteb/models/overview.py | 6 +++ 6 files changed, 292 insertions(+) create mode 100644 mteb/models/cde_models.py create mode 100644 mteb/models/gme_models.py create mode 100644 mteb/models/lens_models.py diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index 001c711ed..6529b0804 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -4,6 +4,8 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader +from .e5_instruct import E5_MISTRAL_TRAINING_DATA + model_prompts = {"query": "Represent this sentence for searching relevant passages: "} model_prompts_zh = {"query": "为这个句子生成表示以用于检索相关文章:"} @@ -496,3 +498,83 @@ public_training_data=None, training_datasets=None, # not disclosed ) + +# Contents of cfli/bge-full-data +bge_full_data = { + # source: https://arxiv.org/pdf/2409.15700 + # Charles Goodhart is turning back and forth + # in his grave as I'm annotating this + # |Retrieval| + # ELI5 + # SQuaD + # TriviaQA + # QuoraDuplicateQuestions + "HotpotQA": ["train"], + "FEVER": ["train"], + "MSMARCO": ["train"], + "NQ": ["train"], + "ArguAna": ["train"], + "FiQA2018": ["train"], + # |Reranking| + "SciDocsReranking": ["train"], + "StackOverflowDupQuestions": ["train"], + # |Classification| + "AmazonReviewsClassification": ["train"], + "AmazonCounterfactualClassification": ["train"], + "Banking77Classification": ["train"], + "EmotionClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "MTOPIntentClassification": ["train"], + "ImdbClassification": ["train"], + "ToxicConversationsClassification": ["train"], + # |Clustering| + "ArxivClusteringS2S": ["train"], + "ArxivClusteringP2P": ["train"], + "BiorxivClusteringS2S": ["train"], + "BiorxivClusteringP2P": ["train"], + "MedrxivClusteringS2S": ["train"], + "MedrxivClusteringP2P": ["train"], + "BiorxivClusteringS2S.v2": ["train"], + "BiorxivClusteringP2P.v2": ["train"], + "MedrxivClusteringS2S.v2": ["train"], + "MedrxivClusteringP2P.v2": ["train"], + "RedditClusteringP2P": ["train"], + "RedditClustering": ["train"], + "RedditClustering.v2": ["train"], + "TwentyNewsgroupsClustering": ["train"], + "TwentyNewsgroupsClustering.v2": 
["train"], + # |STS| + "STS22": ["train"], + "STS22.v2": ["train"], + "STSBenchmark": ["train"], +} + +bge_en_icl = ModelMeta( + loader=partial( + sentence_transformers_loader, + model_name="BAAI/bge-en-icl", + revision="971c7e1445cc86656ca0bd85ed770b8675a40bb5", + ), + name="BAAI/bge-en-icl", + languages=[ + "eng_Latn", + ], + open_weights=True, + revision="971c7e1445cc86656ca0bd85ed770b8675a40bb5", + release_date="2024-07-25", # initial commit of hf model. + n_parameters=7.11 * 1e9, + embed_dim=4096, + license="apache-2", + max_tokens=32768, + reference="https://huggingface.co/BAAI/bge-en-icl", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code="https://github.com/FlagOpen/FlagEmbedding", + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", + training_datasets={ + **E5_MISTRAL_TRAINING_DATA, + **bge_full_data, + }, + adapted_from="intfloat/e5-mistral-7b-instruct", +) diff --git a/mteb/models/cde_models.py b/mteb/models/cde_models.py new file mode 100644 index 000000000..78870ef12 --- /dev/null +++ b/mteb/models/cde_models.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +import logging + +from mteb.model_meta import ModelMeta + +from .bge_models import bge_full_data + +logger = logging.getLogger(__name__) + + +cde_small_v1 = ModelMeta( + loader=None, # I will leave this at None for now, + name="jxm/cde-small-v1", + languages=["eng_Latn"], + open_weights=True, + revision="8d5736163718a8b65cd787b75ed61020d18bad3c", + release_date="2024-09-24", + n_parameters=int(281 * 1e6), # Though the second-stage model is only 140M + max_tokens=512, + embed_dim=768, + license="mit", + similarity_fn_name="cosine", + framework=["Sentence Transformers"], + reference="https://huggingface.co/jxm/cde-small-v1", + use_instructions=True, + adapted_from="nomic-ai/nomic-bert-2048", + superseded_by="jxm/cde-small-v2", + training_datasets=bge_full_data, + public_training_code="https://github.com/jxmorris12/cde", + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", +) + +cde_small_v2 = ModelMeta( + loader=None, # I will leave this at None for now, + name="jxm/cde-small-v2", + languages=["eng_Latn"], + open_weights=True, + revision="a7e5882ad52c27ea2831fc8258f24379c25cb459", + release_date="2025-01-13", + n_parameters=int(306 * 1e6), # Though the second-stage model is only 140M + max_tokens=512, + embed_dim=768, + license="mit", + similarity_fn_name="cosine", + framework=["Sentence Transformers"], + reference="https://huggingface.co/jxm/cde-small-v1", + use_instructions=True, + adapted_from="answerdotai/ModernBERT-base", + superseded_by="jxm/cde-small-v2", + training_datasets=bge_full_data, + public_training_code="https://github.com/jxmorris12/cde", + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", +) diff --git a/mteb/models/gme_models.py b/mteb/models/gme_models.py new file mode 100644 index 000000000..53476403b --- /dev/null +++ b/mteb/models/gme_models.py @@ -0,0 +1,63 @@ +from __future__ import annotations + +import logging +from functools import partial + +from mteb.model_meta import ModelMeta + +logger = logging.getLogger(__name__) + + +gme_qwen2_vl_2b_instruct = ModelMeta( + loader=None, + name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", + languages=["eng_Latn"], + open_weights=True, + revision="cfeb66885b598de483cc04eb08c7d9da534d7afe", + release_date="2024-12-21", + n_parameters=int(2.21 * 1e9), + max_tokens=32768, + embed_dim=1536, + license="mit", + 
similarity_fn_name="cosine", + framework=["PyTorch"], + reference="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", + use_instructions=True, + adapted_from=None, + superseded_by=None, + training_datasets={ + # Only annotating text data for now + # source: https://arxiv.org/pdf/2412.16855 + "MSMARCO": ["train"], + "MSMARCO.v2": ["train"], + }, + public_training_code=None, + public_training_data=None, +) + +gme_qwen2_vl_7b_instruct = ModelMeta( + loader=None, + name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", + languages=["eng_Latn"], + open_weights=True, + revision="d42eca5a540526cfa982a349724b24b25c12a95e", + release_date="2024-12-21", + n_parameters=int(8.29 * 1e9), + max_tokens=32768, + embed_dim=3584, + license="mit", + similarity_fn_name="cosine", + framework=["PyTorch"], + reference="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct", + use_instructions=True, + adapted_from=None, + superseded_by=None, + training_datasets={ + # Only annotating text data for now + # source: https://arxiv.org/pdf/2412.16855 + "MSMARCO": ["train"], + "MSMARCO.v2": ["train"], + }, + public_training_code=None, + public_training_data=None, +) diff --git a/mteb/models/lens_models.py b/mteb/models/lens_models.py new file mode 100644 index 000000000..6e25e7dbf --- /dev/null +++ b/mteb/models/lens_models.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +from functools import partial + +import torch + +from mteb.encoder_interface import PromptType +from mteb.model_meta import ModelMeta, sentence_transformers_loader +from mteb.models.instruct_wrapper import instruct_wrapper + +lens_d4000 = ModelMeta( + loader=None, # TODO: implement this in the future + name="yibinlei/LENS-d4000", + languages=None, + open_weights=True, + revision="e473b33364e6c48a324796fd1411d3b93670c6fe", + release_date="2025-01-17", + n_parameters=int(7.11 * 1e9), + embed_dim=4000, + license="apache-2.0", + reference="https://huggingface.co/yibinlei/LENS-d4000", + similarity_fn_name="cosine", + framework=["PyTorch"], + use_instructions=True, + public_training_code=None, + public_training_data=None, + training_datasets=None, + max_tokens=32768, +) + +lens_d8000 = ModelMeta( + loader=None, # TODO: implement this in the future + name="yibinlei/LENS-d8000", + languages=None, + open_weights=True, + revision="a0b87bd91cb27b6f2f0b0fe22c28026da1d464ef", + release_date="2025-01-17", + n_parameters=int(7.11 * 1e9), + embed_dim=8000, + license="apache-2.0", + reference="https://huggingface.co/yibinlei/LENS-d8000", + similarity_fn_name="cosine", + framework=["PyTorch"], + use_instructions=True, + public_training_code=None, + public_training_data=None, + training_datasets=None, + max_tokens=32768, +) diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index ba12b0bb5..140d8bac7 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -1737,3 +1737,41 @@ training_datasets=None, # They "scraped" things from the internet, we don't know, could be leakage superseded_by=None, ) +ember_v1 = ModelMeta( + name="llmrails/ember-v1", + revision="5e5ce5904901f6ce1c353a95020f17f09e5d021d", + release_date="2023-10-10", + languages=["eng_Latn"], + n_parameters=335 * 1e6, + max_tokens=512, + embed_dim=1024, + license="mit", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/llmrails/ember-v1", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + superseded_by=None, +) 
+amazon_titan_text_embeddings_v2 = ModelMeta( + name="amazon/Titan-text-embeddings-v2", + revision="1", + release_date="2024-04-30", + languages=["eng_Latn"], + n_parameters=None, + max_tokens=None, + embed_dim=None, + license="proprietary", + open_weights=False, + public_training_code=None, + public_training_data=None, + framework=[], + reference="https://huggingface.co/amazon/Titan-text-embeddings-v2", + similarity_fn_name="cosine", + use_instructions=False, + training_datasets=None, + superseded_by=None, +) diff --git a/mteb/models/overview.py b/mteb/models/overview.py index e444b1105..ccc8fbdda 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -15,10 +15,12 @@ arctic_models, bge_models, bm25, + cde_models, cohere_models, colbert_models, e5_instruct, e5_models, + gme_models, google_models, gritlm_models, gte_models, @@ -26,6 +28,7 @@ inf_models, jasper_models, jina_models, + lens_models, linq_models, llm2vec_models, misc_models, @@ -56,6 +59,7 @@ arctic_models, bge_models, bm25, + cde_models, cohere_models, colbert_models, e5_instruct, @@ -64,9 +68,11 @@ google_models, gritlm_models, gte_models, + gme_models, ibm_granite_models, inf_models, jina_models, + lens_models, linq_models, llm2vec_models, mxbai_models, From f645183b26c7226a5ccf6ad0dea201d4a1a8f8b4 Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 22 Jan 2025 11:50:13 +0000 Subject: [PATCH 46/49] 1.29.15 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 18b551f19..9f6c5e571 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.14" +version = "1.29.15" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From e77543694ae16716c4420dd0b79c0d9f33a938db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Wed, 22 Jan 2025 13:04:47 +0100 Subject: [PATCH 47/49] fix: Added correct training data annotation to LENS (#1859) Added correct training data annotation to LENS --- mteb/models/lens_models.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mteb/models/lens_models.py b/mteb/models/lens_models.py index 6e25e7dbf..2fe54b26a 100644 --- a/mteb/models/lens_models.py +++ b/mteb/models/lens_models.py @@ -8,6 +8,8 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader from mteb.models.instruct_wrapper import instruct_wrapper +from .bge_models import bge_full_data + lens_d4000 = ModelMeta( loader=None, # TODO: implement this in the future name="yibinlei/LENS-d4000", @@ -23,8 +25,8 @@ framework=["PyTorch"], use_instructions=True, public_training_code=None, - public_training_data=None, - training_datasets=None, + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", + training_datasets=bge_full_data, max_tokens=32768, ) @@ -43,7 +45,7 @@ framework=["PyTorch"], use_instructions=True, public_training_code=None, - public_training_data=None, - training_datasets=None, + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", + training_datasets=bge_full_data, max_tokens=32768, ) From fa5127a6ff045cb5386c77da0b7e92d1386bbbec Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 22 Jan 2025 12:11:01 +0000 Subject: [PATCH 48/49] 1.29.16 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pyproject.toml b/pyproject.toml index 9f6c5e571..70ad64bd0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.15" +version = "1.29.16" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From f1d418c88df9ce0d2d46f6a536c3133e49aa4907 Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Fri, 24 Jan 2025 19:50:19 +0300 Subject: [PATCH 49/49] [v2] Update v2 again (#1864) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: Add reported annotation and re-added public_training_data (#1846) * fix: Add additional dataset annotations * fix: readded public training data * update voyage annotations * 1.29.11 Automatically generated by python-semantic-release * fix: Leaderboard Refinements (#1849) * Added better descriptions to benchmarks and removed beta tags * Fixed zero-shot filtering on app loading * Added zero-shot definition in an accordion * NaN values are now filled with blank * Added type hints to filter_models * 1.29.12 Automatically generated by python-semantic-release * fix: Fixed leaderboard search bar (#1852) Fixed leaderboard search bar * 1.29.13 Automatically generated by python-semantic-release * fix: Hotfixed public_training_data type annotation (#1857) Fixed public_training_data flag type to include boolean, as this is how all models are annotated * fix: Fix zeta alpha mistral (#1736) * fix zeta alpha mistral * update use_instructions * update training datasets * Update mteb/models/e5_instruct.py Co-authored-by: Kenneth Enevoldsen * update float * Update mteb/models/e5_instruct.py --------- Co-authored-by: Kenneth Enevoldsen * Add more annotations (#1833) * apply additions from #1794 * add annotations for rumodels * add nomic training data * fix metadata * update rest of model meta * fix bge reranker * 1.29.14 Automatically generated by python-semantic-release * fix: Adding missing model meta (#1856) * Added CDE models * Added bge-en-icl * Updated CDE to bge_full_data * Fixed public_training_data flag type to include boolean, as this is how all models are annotated * Added public training data link instead of bool to CDE and BGE * Added GME models * Changed Torch to PyTorch * Added metadata on LENS models * Added ember_v1 * Added metadata for amazon titan * Removed GME implementation * 1.29.15 Automatically generated by python-semantic-release * fix: Added correct training data annotation to LENS (#1859) Added correct training data annotation to LENS * 1.29.16 Automatically generated by python-semantic-release * lint * fix meta * fix meta * fix empty model meta * lint --------- Co-authored-by: Kenneth Enevoldsen Co-authored-by: github-actions Co-authored-by: Márton Kardos --- mteb/benchmarks/benchmarks.py | 69 +++++-- mteb/leaderboard/app.py | 154 ++++++++------ mteb/leaderboard/table.py | 13 +- mteb/model_meta.py | 4 +- mteb/models/arctic_models.py | 23 ++- mteb/models/bge_models.py | 211 +++++++++++--------- mteb/models/bm25.py | 1 + mteb/models/cde_models.py | 54 +++++ mteb/models/cohere_models.py | 12 +- mteb/models/colbert_models.py | 12 +- mteb/models/e5_instruct.py | 87 ++++++++ mteb/models/e5_models.py | 25 ++- mteb/models/gme_models.py | 62 ++++++ mteb/models/google_models.py | 9 +- mteb/models/gritlm_models.py | 18 +- mteb/models/gte_models.py | 9 +- mteb/models/ibm_granite_models.py | 71 ++++++- mteb/models/inf_models.py | 1 + mteb/models/jasper_models.py | 1 + mteb/models/jina_models.py | 23 ++- 
mteb/models/lens_models.py | 45 +++++ mteb/models/linq_models.py | 1 + mteb/models/llm2vec_models.py | 8 + mteb/models/misc_models.py | 131 ++++++++++-- mteb/models/model2vec_models.py | 9 +- mteb/models/moka_models.py | 9 +- mteb/models/mxbai_models.py | 1 + mteb/models/no_instruct_sentence_models.py | 1 + mteb/models/nomic_models.py | 101 +++++++++- mteb/models/nvidia_models.py | 2 + mteb/models/openai_models.py | 9 +- mteb/models/overview.py | 81 +++----- mteb/models/piccolo_models.py | 2 + mteb/models/promptriever_models.py | 4 + mteb/models/repllama_models.py | 2 + mteb/models/rerankers_custom.py | 6 +- mteb/models/rerankers_monot5_based.py | 14 ++ mteb/models/ru_sentence_models.py | 155 +++++++++++--- mteb/models/salesforce_models.py | 39 ++-- mteb/models/sentence_transformers_models.py | 44 +--- mteb/models/stella_models.py | 8 + mteb/models/text2vec_models.py | 9 +- mteb/models/uae_models.py | 1 + mteb/models/voyage_models.py | 104 +++++++++- pyproject.toml | 2 +- scripts/generate_metadata.py | 1 + tests/test_tasks/test_mteb_rerank.py | 1 + 47 files changed, 1264 insertions(+), 385 deletions(-) create mode 100644 mteb/models/cde_models.py create mode 100644 mteb/models/gme_models.py create mode 100644 mteb/models/lens_models.py diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 36641507a..50e2b45cc 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -71,7 +71,7 @@ def load_results( MTEB_EN = Benchmark( - name="MTEB(eng, beta)", + name="MTEB(eng)", tasks=MTEBTasks( get_tasks( tasks=[ @@ -128,7 +128,13 @@ def load_results( get_task("STS22.v2", eval_splits=["test"], hf_subsets=["en"]), ), ), - description="English benchmarks from MTEB", + description="""The new English Massive Text Embedding Benchmark. +This benchmark was created to account for the fact that many models have now been finetuned +to tasks in the original MTEB, and contains tasks that are not as frequently used for model training. +This way the new benchmark and leaderboard can give our users a more realistic expectation of models' generalization performance. + +The original MTEB leaderboard is available under the [MTEB(eng, classic)](http://mteb-leaderboard-2-demo.hf.space/?benchmark_name=MTEB%28eng%2C+classic%29) tab. + """, citation="", contacts=["KennethEnevoldsen", "Muennighoff"], ) @@ -216,7 +222,12 @@ def load_results( get_task("STS22", eval_splits=["test"], hf_subsets=["en"]), ) ), - description="The original English benchmark by Muennighoff et al., (2023).", + description="""The original English benchmark by Muennighoff et al., (2023). +This page is an adaptation of the [old MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard). + +> We recommend that you use [MTEB(eng)](http://mteb-leaderboard-2-demo.hf.space/?benchmark_name=MTEB%28eng%29) instead, +as many models have been tuned on MTEB(eng, classic) datasets, and MTEB(eng) might give a more accurate representation of models' generalization performance. 
+ """, citation="""@inproceedings{muennighoff-etal-2023-mteb, title = "{MTEB}: Massive Text Embedding Benchmark", author = "Muennighoff, Niklas and @@ -275,7 +286,7 @@ def load_results( "STS22", ], ), - description="Main Russian benchmarks from MTEB", + description="A Russian version of the Massive Text Embedding Benchmark with a number of novel Russian tasks in all task categories of the original MTEB.", reference="https://aclanthology.org/2023.eacl-main.148/", citation="""@misc{snegirev2024russianfocusedembeddersexplorationrumteb, title={The Russian-focused embedders' exploration: ruMTEB benchmark and Russian embedding model design}, @@ -324,8 +335,8 @@ def load_results( "LegalQuAD", ] ), - description="Legal benchmarks from MTEB.", - reference="https://aclanthology.org/2023.eacl-main.148/", + description="A benchmark of retrieval tasks in the legal domain.", + reference=None, citation=None, ) @@ -365,7 +376,10 @@ def load_results( "Tatoeba", ] ), - description="BitextMining benchmark from MINERS", + description="""Bitext Mining texts from the MINERS benchmark, a benchmark designed to evaluate the + ability of multilingual LMs in semantic retrieval tasks, + including bitext mining and classification via retrieval-augmented contexts. + """, reference="https://arxiv.org/pdf/2406.07424", citation=""" @article{winata2024miners, @@ -533,7 +547,7 @@ def load_results( ) + (get_task("STS22", eval_splits=["test"], hf_subsets=["fr"]),) ), - description="Main French benchmarks from MTEB", + description="MTEB-French, a French expansion of the original benchmark with high-quality native French datasets.", reference="https://arxiv.org/abs/2405.20468", citation="""@misc{ciancone2024mtebfrenchresourcesfrenchsentence, title={MTEB-French: Resources for French Sentence Embedding Evaluation and Analysis}, @@ -581,7 +595,7 @@ def load_results( "STS22", ], ), - description="Main German benchmarks from MTEB", + description="A benchmark for text-embedding performance in German.", reference="https://arxiv.org/html/2401.02709v1", citation="""@misc{wehrli2024germantextembeddingclustering, title={German Text Embedding Clustering Benchmark}, @@ -613,7 +627,7 @@ def load_results( "KorSTS", ], ), - description="Main Korean benchmarks from MTEB", + description="A benchmark and leaderboard for evaluation of text embedding in Korean.", reference=None, citation=None, ) @@ -650,7 +664,11 @@ def load_results( ) + (get_task("STS22", eval_splits=["test"], hf_subsets=["pl"]),), ), - description="Main Polish benchmarks from MTEB", + description="""Polish Massive Text Embedding Benchmark (PL-MTEB), a comprehensive benchmark for text embeddings in Polish. The PL-MTEB consists of 28 diverse NLP +tasks from 5 task types. With tasks adapted based on previously used datasets by the Polish +NLP community. 
In addition, a new PLSC (Polish Library of Science Corpus) dataset was created +consisting of titles and abstracts of scientific publications in Polish, which was used as the basis for +two novel clustering tasks.""", # Rephrased from the abstract reference="https://arxiv.org/abs/2405.10138", citation="""@article{poswiata2024plmteb, title={PL-MTEB: Polish Massive Text Embedding Benchmark}, @@ -695,14 +713,14 @@ def load_results( "typescript", ], ), - description="Main code benchmarks from MTEB", + description="A massive code embedding benchmark covering retrieval tasks in a myriad of popular programming languages.", reference=None, citation=None, ) MTEB_multilingual = Benchmark( - name="MTEB(Multilingual, beta)", + name="MTEB(Multilingual)", tasks=get_tasks( tasks=[ "BornholmBitextMining", @@ -840,7 +858,7 @@ def load_results( "MIRACLRetrievalHardNegatives", ], ), - description="The Multilingual benchmarks from MMTEB. Currently under development.", + description="A large-scale multilingual expansion of MTEB, driven mainly by highly-curated community contributions covering 250+ languages.", reference=None, citation=None, contacts=["KennethEnevoldsen", "isaac-chung"], @@ -875,7 +893,7 @@ def load_results( "ESCIReranking", ], ), - description="Main Japanese benchmarks from MTEB", + description="JMTEB is a benchmark for evaluating Japanese text embedding models.", reference="https://github.com/sbintuitions/JMTEB", citation=None, ) @@ -915,7 +933,7 @@ def load_results( ] MTEB_INDIC = Benchmark( - name="MTEB(Indic, beta)", + name="MTEB(Indic)", tasks=get_tasks( tasks=[ # Bitext @@ -952,7 +970,7 @@ def load_results( languages=indic_languages, exclusive_language_filter=True, ), - description="Main Indic benchmark from MMTEB", + description="A regional geopolitical text embedding benchmark targeting embedding performance on Indic languages.", reference=None, citation=None, contacts=["KennethEnevoldsen", "isaac-chung"], ) @@ -1003,7 +1021,7 @@ def load_results( ] MTEB_EU = Benchmark( - name="MTEB(Europe, beta)", + name="MTEB(Europe)", tasks=get_tasks( tasks=[ "BornholmBitextMining", @@ -1084,7 +1102,7 @@ def load_results( languages=eu_languages, exclusive_language_filter=True, ), - description="Main European benchmark from MMTEB", + description="A regional geopolitical text embedding benchmark targeting embedding performance on European languages.", reference=None, citation=None, contacts=["KennethEnevoldsen", "isaac-chung"], @@ -1102,7 +1120,10 @@ def load_results( "LEMBWikimQARetrieval", ], ), - description="The main benchmark for evaluating long document retrieval.", + description="""LongEmbed is a benchmark oriented at exploring models' performance on long-context retrieval. + The benchmark comprises two synthetic tasks and four carefully chosen real-world tasks, + featuring documents of varying length and dispersed target information. + """, # Pieced together from paper abstract. reference="https://arxiv.org/abs/2404.12096v2", citation="""@article{zhu2024longembed, title={LongEmbed: Extending Embedding Models for Long Context Retrieval}, @@ -1117,7 +1138,13 @@ def load_results( tasks=get_tasks( tasks=["BrightRetrieval"], ), - description="A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.", + description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.
+ BRIGHT is the first text retrieval + benchmark that requires intensive reasoning to retrieve relevant documents with + a dataset consisting of 1,384 real-world queries spanning diverse domains, such as + economics, psychology, mathematics, and coding. These queries are drawn from + naturally occurring and carefully curated human data. + """, reference="https://brightbenchmark.github.io/", citation="""@article{su2024bright, title={Bright: A realistic and challenging benchmark for reasoning-intensive retrieval}, diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index cb806e467..5ee5a6b9d 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -6,6 +6,7 @@ import tempfile import time from pathlib import Path +from typing import Literal from urllib.parse import urlencode import gradio as gr @@ -48,9 +49,12 @@ def produce_benchmark_link(benchmark_name: str, request: gr.Request) -> str: return md +DEFAULT_BENCHMARK_NAME = "MTEB(Multilingual)" + + def set_benchmark_on_load(request: gr.Request): query_params = request.query_params - return query_params.get("benchmark_name", "MTEB(Multilingual, beta)") + return query_params.get("benchmark_name", DEFAULT_BENCHMARK_NAME) def download_table(table: pd.DataFrame) -> Path: @@ -117,23 +121,75 @@ def update_task_info(task_names: str) -> gr.DataFrame: return gr.DataFrame(df, datatype=["markdown"] + ["str"] * (len(df.columns) - 1)) +# Model sizes in million parameters +MIN_MODEL_SIZE, MAX_MODEL_SIZE = 0, 10_000 + + +def filter_models( + model_names, + task_select, + availability, + compatibility, + instructions, + model_size, + zero_shot_setting, +): + lower, upper = model_size + # Setting to None, when the user doesn't specify anything + if (lower == MIN_MODEL_SIZE) and (upper == MAX_MODEL_SIZE): + lower, upper = None, None + else: + # Multiplying by millions + lower = lower * 1e6 + upper = upper * 1e6 + model_metas = mteb.get_model_metas( + model_names=model_names, + open_weights=availability, + use_instructions=instructions, + frameworks=compatibility, + n_parameters_range=(lower, upper), + ) + tasks = mteb.get_tasks(tasks=task_select) + models_to_keep = set() + for model_meta in model_metas: + is_model_zero_shot = model_meta.is_zero_shot_on(tasks) + if is_model_zero_shot is None: + if zero_shot_setting == "hard": + continue + elif not is_model_zero_shot: + if zero_shot_setting != "off": + continue + models_to_keep.add(model_meta.name) + return list(models_to_keep) + + logger.info("Loading all benchmark results") all_results = load_results() -# Model sizes in million parameters -min_model_size, max_model_size = 0, 10_000 - benchmarks = mteb.get_benchmarks() all_benchmark_results = { benchmark.name: benchmark.load_results(base_results=all_results) for benchmark in benchmarks } -default_benchmark = mteb.get_benchmark("MTEB(Multilingual, beta)") +default_benchmark = mteb.get_benchmark(DEFAULT_BENCHMARK_NAME) default_results = all_benchmark_results[default_benchmark.name] logger.info("Benchmark results loaded") default_scores = default_results.get_scores(format="long") -summary_table, per_task_table = scores_to_tables(default_scores) +all_models = list({entry["model_name"] for entry in default_scores}) +filtered_models = filter_models( + all_models, + default_results.task_names, + availability=None, + compatibility=[], + instructions=None, + model_size=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), + zero_shot_setting="soft", +) + +summary_table, per_task_table = scores_to_tables( + [entry for entry in default_scores if entry["model_name"] in 
filtered_models] + ) benchmark_select = gr.Dropdown( [bench.name for bench in benchmarks], @@ -207,7 +263,7 @@ def update_task_info(task_names: str) -> gr.DataFrame: with gr.Row(): searchbar = gr.Textbox( label="Search Models", - info="Search models by name (RegEx sensitive. Separate queries with `|`)", + info="Press Enter to search.\nSearch models by name (RegEx sensitive. Separate queries with `|`)", interactive=True, ) compatibility = gr.CheckboxGroup( @@ -258,14 +314,14 @@ def update_task_info(task_names: str) -> gr.DataFrame: interactive=True, ) model_size = RangeSlider( - minimum=min_model_size, - maximum=max_model_size, - value=(min_model_size, max_model_size), + minimum=MIN_MODEL_SIZE, + maximum=MAX_MODEL_SIZE, + value=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), label="Model Size (#M Parameters)", interactive=True, ) scores = gr.State(default_scores) - models = gr.State(list({entry["model_name"] for entry in default_scores})) + models = gr.State(filtered_models) with gr.Row(): with gr.Column(): description = gr.Markdown( @@ -295,6 +351,10 @@ def update_task_info(task_names: str) -> gr.DataFrame: """ ) summary_table.render() + download_summary = gr.DownloadButton("Download Table") + download_summary.click( + download_table, inputs=[summary_table], outputs=[download_summary] + ) with gr.Accordion( "What do aggregate measures (Rank(Borda), Mean(Task), etc.) mean?", open=False, @@ -308,10 +368,19 @@ def update_task_info(task_names: str) -> gr.DataFrame: **Mean(TaskType)**: This is a weighted average across different task categories, such as classification or retrieval. It is computed by first averaging scores within each task category and then averaging those category means. Similar to the Mean(Task), this measure is continuous and tends to overvalue tasks with higher variance. This score also prefers models that perform well across all task categories. """ ) - download_summary = gr.DownloadButton("Download Table") - download_summary.click( - download_table, inputs=[summary_table], outputs=[download_summary] - ) + with gr.Accordion( + "What does zero-shot mean?", + open=False, + ): + gr.Markdown( + """ +A model is considered zero-shot if it is not trained on any splits of the datasets used to derive the tasks. +E.g., if a model is trained on Natural Questions, it cannot be considered zero-shot on benchmarks containing the task “NQ”, which is derived from Natural Questions. +This definition creates a few edge cases. For instance, multiple models are typically trained on Wikipedia title and body pairs, but we do not define this as leakage on, e.g., “WikipediaRetrievalMultilingual” and “WikiClusteringP2P”, as these datasets are not based on title-body pairs. +Distilled, further fine-tuned, or otherwise derivative models inherit the datasets of their parent models. +Based on community feedback and research findings, this definition could change in the future.
+ """ + ) with gr.Tab("Performance per task"): per_task_table.render() download_per_task = gr.DownloadButton("Download Table") @@ -405,51 +474,14 @@ def update_task_list(benchmark_name, type_select, domain_select, lang_select): outputs=[task_select], ) - def filter_models( - model_names, - task_select, - availability, - compatibility, - instructions, - model_size, - zero_shot_setting, - ): - lower, upper = model_size - # Setting to None, when the user doesn't specify anything - if (lower == min_model_size) and (upper == max_model_size): - lower, upper = None, None - else: - # Multiplying by millions - lower = lower * 1e6 - upper = upper * 1e6 - model_metas = mteb.get_model_metas( - model_names=model_names, - open_weights=availability, - use_instructions=instructions, - frameworks=compatibility, - n_parameters_range=(lower, upper), - ) - tasks = mteb.get_tasks(tasks=task_select) - models_to_keep = set() - for model_meta in model_metas: - is_model_zero_shot = model_meta.is_zero_shot_on(tasks) - if is_model_zero_shot is None: - if zero_shot_setting == "hard": - continue - elif not is_model_zero_shot: - if zero_shot_setting != "off": - continue - models_to_keep.add(model_meta.name) - return list(models_to_keep) - def update_models( - scores, - tasks, - availability, - compatibility, - instructions, - model_size, - zero_shot, + scores: list[dict], + tasks: list[str], + availability: bool | None, + compatibility: list[str], + instructions: bool | None, + model_size: tuple[int, int], + zero_shot: Literal["hard", "soft", "off"], ): start_time = time.time() model_names = list({entry["model_name"] for entry in scores}) @@ -544,7 +576,7 @@ def update_models( ], outputs=[models], ) - zero_shot.input( + zero_shot.change( update_models, inputs=[ scores, @@ -594,7 +626,7 @@ def update_tables( inputs=[scores, searchbar, task_select, models], outputs=[summary_table, per_task_table], ) - searchbar.input( + searchbar.submit( update_tables, inputs=[scores, searchbar, task_select, models], outputs=[summary_table, per_task_table], diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index 041df4709..ef28392cf 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -142,6 +142,11 @@ def scores_to_tables( names = per_task.index.get_level_values("model_name") names = pd.Series(names, index=per_task.index) to_remove |= ~names.str.contains(search_query, regex=True) + if to_remove.all(): + no_results_frame = pd.DataFrame( + {"No results": ["You can try relaxing your criteria"]} + ) + return gr.DataFrame(no_results_frame), gr.DataFrame(no_results_frame) models_to_remove = list(per_task[to_remove].index) typed_mean = mean_per_type.mean(skipna=False, axis=1) overall_mean = per_task.mean(skipna=False, axis=1) @@ -218,7 +223,11 @@ def scores_to_tables( joint_table[score_columns] = joint_table[score_columns].map(format_scores) joint_table_style = ( joint_table.style.format( - {**{column: "{:.2f}" for column in score_columns}, "Rank (Borda)": "{:.0f}"} + { + **{column: "{:.2f}" for column in score_columns}, + "Rank (Borda)": "{:.0f}", + }, + na_rep="", ) .highlight_min("Rank (Borda)", props="font-weight: bold") .highlight_max(subset=score_columns, props="font-weight: bold") @@ -226,7 +235,7 @@ def scores_to_tables( task_score_columns = per_task.select_dtypes("number").columns per_task[task_score_columns] *= 100 per_task_style = per_task.style.format( - "{:.2f}", subset=task_score_columns + "{:.2f}", subset=task_score_columns, na_rep="" ).highlight_max(subset=task_score_columns, 
props="font-weight: bold") return ( gr.DataFrame( diff --git a/mteb/model_meta.py b/mteb/model_meta.py index eed74c5b4..b0dbccf24 100644 --- a/mteb/model_meta.py +++ b/mteb/model_meta.py @@ -68,7 +68,8 @@ class ModelMeta(BaseModel): release_date: The date the model's revision was released. license: The license under which the model is released. Required if open_weights is True. open_weights: Whether the model is open source or proprietary. - public_training_code: Whether the code used to train the model is publicly available. + public_training_code: A link to the publicly available training code. If none it is assumed that the training code is not publicly available. + public_training_data: A link to the publicly available training data. If none it is assumed that the training data is not publicly available. similarity_fn_name: The distance metric used by the model. framework: The framework the model is implemented in, can be a list of frameworks e.g. `["Sentence Transformers", "PyTorch"]`. reference: A URL to the model's page on huggingface or another source. @@ -97,6 +98,7 @@ class ModelMeta(BaseModel): license: str | None open_weights: bool | None public_training_code: str | None + public_training_data: str | bool | None framework: list[FRAMEWORKS] reference: STR_URL | None = None similarity_fn_name: DISTANCE_METRICS | None diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py index b7217d1ef..dd3cd1c8d 100644 --- a/mteb/models/arctic_models.py +++ b/mteb/models/arctic_models.py @@ -116,6 +116,7 @@ url={https://arxiv.org/abs/2407.18887}, }""", public_training_code=None, + public_training_data=None, training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -164,7 +165,8 @@ use_instructions=True, adapted_from="sentence-transformers/all-MiniLM-L6-v2", superseded_by=None, - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -213,7 +215,8 @@ use_instructions=True, adapted_from="intfloat/e5-small-unsupervised", superseded_by=None, - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -262,7 +265,8 @@ use_instructions=True, adapted_from="intfloat/e5-base-unsupervised", superseded_by="Snowflake/snowflake-arctic-embed-m-v1.5", - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -311,7 +315,8 @@ use_instructions=True, adapted_from="nomic-ai/nomic-embed-text-v1-unsupervised", superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0", - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -360,7 +365,8 @@ use_instructions=True, adapted_from="intfloat/e5-base-unsupervised", superseded_by="Snowflake/snowflake-arctic-embed-l-v2.0", - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming 
everything @@ -412,6 +418,7 @@ adapted_from=None, superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0", public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -437,7 +444,8 @@ use_instructions=True, adapted_from="Alibaba-NLP/gte-multilingual-base", superseded_by=None, - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -485,7 +493,8 @@ use_instructions=True, adapted_from="BAAI/bge-m3-retromae", superseded_by=None, - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index 79d220588..91ff256bb 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -4,6 +4,8 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader +from .e5_instruct import E5_MISTRAL_TRAINING_DATA + model_prompts = {"query": "Represent this sentence for searching relevant passages: "} BGE_15_CITATION = """@misc{bge_embedding, title={C-Pack: Packaged Resources To Advance General Chinese Embedding}, @@ -15,8 +17,8 @@ }""" model_prompts_zh = {"query": "为这个句子生成表示以用于检索相关文章:"} -bge_m_training_data = { - # source: https://arxiv.org/pdf/2402.03216 +bge_m3_training_data = { + # source: https://arxiv.org/abs/2402.03216 "MIRACLRetrieval": ["train"], "MIRACLRetrievalHardNegatives": ["train"], "MIRACLReranking": ["train"], @@ -36,6 +38,28 @@ "HotpotQA": ["train"], "HotpotQA-PL": ["train"], # translation not trained on "HotpotQAHardNegatives": ["train"], + "T2Retrieval": ["train"], + "DuReader": ["train"], + "MMarcoReranking": ["train"], + "CodeSearchNet": ["train"], + # not in mteb + # "s2orc" + # Wikipedia + # "xP3" + # "mC4" + # "CC-News" + # "MTP" + # "NLLB" + # "CCMatrix" + # TriviaQA + # COL-IEE + # PubMedQA + # SQuAD + # SimCSE + # mMARCO-ZH + # LawGPT + # NLI-zh2, LeCaRDv2, + # NLI, MultiLongDoc (their syntetic) # + synthetic data } @@ -97,38 +121,6 @@ # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) } -bgem3_training_data = { - # source https://arxiv.org/abs/2402.03216 - "T2Retrieval": ["train"], - "DuReader": ["train"], - "MMarcoReranking": ["train"], - "CMedQAv2-reranking": ["train"], - "HotpotQA": ["train"], - "NQ": ["train"], - "MSMARCO": ["train"], - "MrTidyRetrieval": ["train"], - "MIRACLRetrieval": ["train"], - "CodeSearchNet": ["train"], - # not in mteb - # "s2orc" - # Wikipedia - # "xP3" - # "mC4" - # "CC-News" - # "MTP" - # "NLLB" - # "CCMatrix" - # TriviaQA - # COL-IEE - # PubMedQA - # SQuAD - # SimCSE - # mMARCO-ZH - # LawGPT - # NLI-zh2, LeCaRDv2, - # NLI, MultiLongDoc (their syntetic) -} - # https://huggingface.co/BAAI/bge-m3/discussions/29 bgem3_languages = [ "afr_Latn", # af @@ -306,59 +298,6 @@ "zho_Hans", # zh ] -bge_m_training_data = { - # source: https://arxiv.org/pdf/2402.03216 - "MIRACLRetrieval": ["train"], - "MIRACLRetrievalHardNegatives": ["train"], - "MIRACLReranking": ["train"], - "LeCaRDv2": ["train"], - "CMedQAv1-reranking": ["train"], - "CMedQAv2-reranking": ["train"], - "MrTidyRetrieval": ["train"], - "T2Reranking": ["train"], - "MSMARCO": ["train"], - "MSMARCOHardNegatives": ["train"], - "NanoMSMARCORetrieval": ["train"], - "MSMARCO-PL": ["train"], # translation not trained on - 
"NQ": ["train"], - "NQHardNegatives": ["train"], - "NanoNQRetrieval": ["train"], - "NQ-PL": ["train"], # translation not trained on - "HotpotQA": ["train"], - "HotpotQA-PL": ["train"], # translation not trained on - "HotpotQAHardNegatives": ["train"], - # + synthetic data -} - -bge_training_data = { - # source: https://data.baai.ac.cn/details/BAAI-MTP - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], # assumed from: amazon_reviews_multi - "MLQARetrieval": [ - "validation", - "test", - ], # assumed from mlqa (question, context) - # not in mteb - # Dataset Pairs - # wudao (title, passage) - # cmrc2018 (query, context) - # dureader (query, context) - # simclue (sentence_a, sentence_b) - # csl (title, abstract) - # amazon_reviews_multi (title, body) - # wiki_atomic_edits (base_sentence, edited_sentence) - # mlqa (question, context) - # xlsum (title, summary) (title, text) - # "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further - # "wikipedia": [], # title + section title, passage - # "reddit": [], # title, body - # "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer) - # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) -} bge_small_en_v1_5 = ModelMeta( loader=partial( # type: ignore @@ -380,7 +319,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_code=None, + public_training_data="https://data.baai.ac.cn/details/BAAI-MTP", training_datasets=bge_training_data, citation=BGE_15_CITATION, ) @@ -406,6 +346,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_data="https://data.baai.ac.cn/details/BAAI-MTP", training_datasets=bge_training_data, citation=BGE_15_CITATION, ) @@ -432,6 +373,7 @@ use_instructions=True, citation=BGE_15_CITATION, public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_data="https://data.baai.ac.cn/details/BAAI-MTP", training_datasets=bge_training_data, ) @@ -455,7 +397,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_code=None, + public_training_data=None, training_datasets=bge_chinese_training_data, ) @@ -479,7 +422,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_code=None, + public_training_data=None, training_datasets=bge_chinese_training_data, ) @@ -503,7 +447,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_code=None, + public_training_data=None, training_datasets=bge_chinese_training_data, ) @@ -527,7 +472,8 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, - training_datasets=bgem3_training_data, + 
public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", + training_datasets=bge_m3_training_data, ) @@ -560,5 +506,86 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, # not disclosed ) + +# Contents of cfli/bge-full-data +bge_full_data = { + # source: https://arxiv.org/pdf/2409.15700 + # Charles Goodhart is turning back and forth + # in his grave as I'm annotating this + # |Retrieval| + # ELI5 + # SQuaD + # TriviaQA + # QuoraDuplicateQuestions + "HotpotQA": ["train"], + "FEVER": ["train"], + "MSMARCO": ["train"], + "NQ": ["train"], + "ArguAna": ["train"], + "FiQA2018": ["train"], + # |Reranking| + "SciDocsReranking": ["train"], + "StackOverflowDupQuestions": ["train"], + # |Classification| + "AmazonReviewsClassification": ["train"], + "AmazonCounterfactualClassification": ["train"], + "Banking77Classification": ["train"], + "EmotionClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "MTOPIntentClassification": ["train"], + "ImdbClassification": ["train"], + "ToxicConversationsClassification": ["train"], + # |Clustering| + "ArxivClusteringS2S": ["train"], + "ArxivClusteringP2P": ["train"], + "BiorxivClusteringS2S": ["train"], + "BiorxivClusteringP2P": ["train"], + "MedrxivClusteringS2S": ["train"], + "MedrxivClusteringP2P": ["train"], + "BiorxivClusteringS2S.v2": ["train"], + "BiorxivClusteringP2P.v2": ["train"], + "MedrxivClusteringS2S.v2": ["train"], + "MedrxivClusteringP2P.v2": ["train"], + "RedditClusteringP2P": ["train"], + "RedditClustering": ["train"], + "RedditClustering.v2": ["train"], + "TwentyNewsgroupsClustering": ["train"], + "TwentyNewsgroupsClustering.v2": ["train"], + # |STS| + "STS22": ["train"], + "STS22.v2": ["train"], + "STSBenchmark": ["train"], +} + +bge_en_icl = ModelMeta( + loader=partial( + sentence_transformers_loader, + model_name="BAAI/bge-en-icl", + revision="971c7e1445cc86656ca0bd85ed770b8675a40bb5", + ), + name="BAAI/bge-en-icl", + languages=[ + "eng_Latn", + ], + open_weights=True, + revision="971c7e1445cc86656ca0bd85ed770b8675a40bb5", + release_date="2024-07-25", # initial commit of hf model. 
+ n_parameters=7.11 * 1e9, + embed_dim=4096, + license="apache-2", + max_tokens=32768, + reference="https://huggingface.co/BAAI/bge-en-icl", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code="https://github.com/FlagOpen/FlagEmbedding", + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", + training_datasets={ + **E5_MISTRAL_TRAINING_DATA, + **bge_full_data, + }, + adapted_from="intfloat/e5-mistral-7b-instruct", +) diff --git a/mteb/models/bm25.py b/mteb/models/bm25.py index ea56fd432..6e3d3747d 100644 --- a/mteb/models/bm25.py +++ b/mteb/models/bm25.py @@ -139,5 +139,6 @@ def encode(self, texts: list[str], **kwargs): framework=[], use_instructions=False, public_training_code="https://github.com/xhluca/bm25s", + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/cde_models.py b/mteb/models/cde_models.py new file mode 100644 index 000000000..78870ef12 --- /dev/null +++ b/mteb/models/cde_models.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +import logging + +from mteb.model_meta import ModelMeta + +from .bge_models import bge_full_data + +logger = logging.getLogger(__name__) + + +cde_small_v1 = ModelMeta( + loader=None, # I will leave this at None for now, + name="jxm/cde-small-v1", + languages=["eng_Latn"], + open_weights=True, + revision="8d5736163718a8b65cd787b75ed61020d18bad3c", + release_date="2024-09-24", + n_parameters=int(281 * 1e6), # Though the second-stage model is only 140M + max_tokens=512, + embed_dim=768, + license="mit", + similarity_fn_name="cosine", + framework=["Sentence Transformers"], + reference="https://huggingface.co/jxm/cde-small-v1", + use_instructions=True, + adapted_from="nomic-ai/nomic-bert-2048", + superseded_by="jxm/cde-small-v2", + training_datasets=bge_full_data, + public_training_code="https://github.com/jxmorris12/cde", + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", +) + +cde_small_v2 = ModelMeta( + loader=None, # I will leave this at None for now, + name="jxm/cde-small-v2", + languages=["eng_Latn"], + open_weights=True, + revision="a7e5882ad52c27ea2831fc8258f24379c25cb459", + release_date="2025-01-13", + n_parameters=int(306 * 1e6), # Though the second-stage model is only 140M + max_tokens=512, + embed_dim=768, + license="mit", + similarity_fn_name="cosine", + framework=["Sentence Transformers"], + reference="https://huggingface.co/jxm/cde-small-v1", + use_instructions=True, + adapted_from="answerdotai/ModernBERT-base", + superseded_by="jxm/cde-small-v2", + training_datasets=bge_full_data, + public_training_code="https://github.com/jxmorris12/cde", + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", +) diff --git a/mteb/models/cohere_models.py b/mteb/models/cohere_models.py index 8718a2e2a..60ff63ee8 100644 --- a/mteb/models/cohere_models.py +++ b/mteb/models/cohere_models.py @@ -234,7 +234,8 @@ def encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -257,7 +258,8 @@ def encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -280,7 +282,8 @@ def encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - 
public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -303,6 +306,7 @@ def encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) diff --git a/mteb/models/colbert_models.py b/mteb/models/colbert_models.py index 0a8c0e4a5..6c2951085 100644 --- a/mteb/models/colbert_models.py +++ b/mteb/models/colbert_models.py @@ -156,6 +156,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: open_weights=True, revision="c1e84128e85ef755c096a95bdb06b47793b13acf", public_training_code=None, + public_training_data=None, release_date="2024-09-21", n_parameters=110 * 1e6, max_tokens=180, # Reduced for Benchmarking - see ColBERT paper @@ -167,7 +168,9 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: use_instructions=False, adapted_from=None, superseded_by=None, - training_datasets=None, + training_datasets={ + "MSMARCO": ["train"], # dev? + }, ) @@ -208,6 +211,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: open_weights=True, revision="4cf816e5e2b03167b132a3c847a9ecd48ba708e1", public_training_code=None, + public_training_data=None, release_date="2024-08-16", n_parameters=559 * 1e6, max_tokens=8192, @@ -219,5 +223,9 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: use_instructions=False, adapted_from=None, superseded_by=None, - training_datasets=None, + training_datasets={ + "MSMARCO": ["train"], + "DuRetrieval": [], + "MIRACL": ["train"], + }, ) diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py index 1d457652a..58afc1797 100644 --- a/mteb/models/e5_instruct.py +++ b/mteb/models/e5_instruct.py @@ -15,6 +15,16 @@ E5_INSTRUCTION = "Instruct: {instruction}\nQuery: " +E5_MISTRAL_TRAINING_DATA = { + **E5_TRAINING_DATA, + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "FEVER-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQAHardNegatives": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on +} + e5_instruct = ModelMeta( loader=partial( # type: ignore instruct_wrapper, @@ -46,6 +56,7 @@ year={2024} }""", public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) @@ -91,5 +102,81 @@ } """, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) + +zeta_alpha_ai__Zeta_Alpha_E5_Mistral = ModelMeta( + loader=partial( # type: ignore + instruct_wrapper, + model_name_or_path="zeta-alpha-ai/Zeta-Alpha-E5-Mistral", + instruction_template=E5_INSTRUCTION, + attn="cccc", + pooling_method="lasttoken", + mode="embedding", + torch_dtype=torch.bfloat16, + # The ST script does not normalize while the HF one does so unclear what to do + # https://huggingface.co/intfloat/e5-mistral-7b-instruct#transformers + normalized=True, + ), + name="zeta-alpha-ai/Zeta-Alpha-E5-Mistral", + revision="c791d37474fa6a5c72eb3a2522be346bc21fbfc3", + release_date="2024-08-30", + languages=["eng_Latn"], + n_parameters=7110660096, + max_tokens=32768.0, + embed_dim=4096, + license="mit", + open_weights=True, + public_training_data=None, + public_training_code=None, + framework=["PyTorch"], + reference="https://huggingface.co/zeta-alpha-ai/Zeta-Alpha-E5-Mistral", + similarity_fn_name="cosine", + use_instructions=True, + training_datasets={ + # copied from e5 + # source: https://arxiv.org/pdf/2212.03533 
+ "NQ": ["test"], + "NQHardNegatives": ["test"], + "MSMARCO": ["train"], # dev? + # source: https://www.zeta-alpha.com/post/fine-tuning-an-llm-for-state-of-the-art-retrieval-zeta-alpha-s-top-10-submission-to-the-the-mteb-be + # "Arguana", + # "FEVER", + # "FIQA", + # "HotPotQA", + # "MsMarco (passage)", + # "NFCorpus", + # "SciFact", + # "NLI", + # "SQuad", + # "StackExchange", + # "TriviaQA", + # "SciRep", + # "SciRepEval" + # mteb + # https://huggingface.co/datasets/mteb/raw_arxiv + # "ArxivClusteringS2S": ["train"], + # "ArxivClusteringP2P": ["train"], + # https://huggingface.co/datasets/mteb/raw_biorxiv + # "BiorxivClusteringS2S": ["train"], + # "BiorxivClusteringP2P": ["train"], + # https://huggingface.co/datasets/mteb/raw_medrxiv + # "MedrxivClusteringS2S": ["train"], + # "MedrxivClusteringP2P": ["train"], + # as their train datasets + "AmazonCounterfactualClassification": ["train"], + "AmazonReviewsClassification": ["train"], + "Banking77Classification": ["train"], + "EmotionClassification": ["train"], + "MTOPIntentClassification": ["train"], + "ToxicConversationsClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "ImdbClassification": ["train"], + "STS12": ["train"], + "STS22": ["train"], + "STSBenchmark": ["train"], + }, + adapted_from="intfloat/e5-mistral-7b-instruct", + superseded_by=None, +) diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py index fe265f6f4..4c3c3d479 100644 --- a/mteb/models/e5_models.py +++ b/mteb/models/e5_models.py @@ -144,6 +144,16 @@ "NQ-PL": ["train"], # translation not trained on } +ME5_TRAINING_DATA = { + **E5_TRAINING_DATA, + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "FEVER-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQAHardNegatives": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on +} + e5_mult_small = ModelMeta( loader=partial( # type: ignore sentence_transformers_loader, @@ -166,7 +176,8 @@ use_instructions=True, citation=MULTILINGUAL_E5_CITATION, public_training_code=None, # couldn't find - training_datasets=E5_TRAINING_DATA, + training_datasets=ME5_TRAINING_DATA, + public_training_data=None, ) e5_mult_base = ModelMeta( @@ -189,7 +200,8 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, - training_datasets=E5_TRAINING_DATA, + public_training_data=None, + training_datasets=ME5_TRAINING_DATA, citation=MULTILINGUAL_E5_CITATION, ) @@ -214,7 +226,8 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, - training_datasets=E5_TRAINING_DATA, + public_training_data=None, + training_datasets=ME5_TRAINING_DATA, citation=MULTILINGUAL_E5_CITATION, ) @@ -238,6 +251,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, citation=E5_CITATION, ) @@ -263,6 +277,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, citation=E5_CITATION, ) @@ -291,6 +306,7 @@ adapted_from=None, citation=E5_CITATION, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) @@ -317,6 +333,7 @@ superseded_by=None, adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, citation=E5_CITATION, ) @@ -344,6 +361,7 @@ superseded_by="intfloat/e5-large-v2", 
adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, citation=E5_CITATION, ) @@ -371,6 +389,7 @@ superseded_by="intfloat/e5-base-v2", adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, citation=E5_CITATION, ) diff --git a/mteb/models/gme_models.py b/mteb/models/gme_models.py new file mode 100644 index 000000000..804dfbc84 --- /dev/null +++ b/mteb/models/gme_models.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +import logging + +from mteb.model_meta import ModelMeta + +logger = logging.getLogger(__name__) + + +gme_qwen2_vl_2b_instruct = ModelMeta( + loader=None, + name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", + languages=["eng_Latn"], + open_weights=True, + revision="cfeb66885b598de483cc04eb08c7d9da534d7afe", + release_date="2024-12-21", + n_parameters=int(2.21 * 1e9), + max_tokens=32768, + embed_dim=1536, + license="mit", + similarity_fn_name="cosine", + framework=["PyTorch"], + reference="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", + use_instructions=True, + adapted_from=None, + superseded_by=None, + training_datasets={ + # Only annotating text data for now + # source: https://arxiv.org/pdf/2412.16855 + "MSMARCO": ["train"], + "MSMARCO.v2": ["train"], + }, + public_training_code=None, + public_training_data=None, +) + +gme_qwen2_vl_7b_instruct = ModelMeta( + loader=None, + name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", + languages=["eng_Latn"], + open_weights=True, + revision="d42eca5a540526cfa982a349724b24b25c12a95e", + release_date="2024-12-21", + n_parameters=int(8.29 * 1e9), + max_tokens=32768, + embed_dim=3584, + license="mit", + similarity_fn_name="cosine", + framework=["PyTorch"], + reference="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct", + use_instructions=True, + adapted_from=None, + superseded_by=None, + training_datasets={ + # Only annotating text data for now + # source: https://arxiv.org/pdf/2412.16855 + "MSMARCO": ["train"], + "MSMARCO.v2": ["train"], + }, + public_training_code=None, + public_training_data=None, +) diff --git a/mteb/models/google_models.py b/mteb/models/google_models.py index 08065f7af..40d316fee 100644 --- a/mteb/models/google_models.py +++ b/mteb/models/google_models.py @@ -151,7 +151,8 @@ def encode( similarity_fn_name="cosine", # assumed framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -173,7 +174,8 @@ def encode( similarity_fn_name="cosine", # assumed framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -195,6 +197,7 @@ def encode( similarity_fn_name="cosine", # assumed framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) diff --git a/mteb/models/gritlm_models.py b/mteb/models/gritlm_models.py index ab32a6a9a..eb23ee66b 100644 --- a/mteb/models/gritlm_models.py +++ b/mteb/models/gritlm_models.py @@ -11,6 +11,18 @@ logger = logging.getLogger(__name__) +GRIT_LM_TRAINING_DATA = { + **E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 + # also uses medi2 which contains fever and hotpotqa: + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "FEVER-PL": ["train"], # translation not trained on + "HotpotQA": 
["train"], + "HotpotQAHardNegatives": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on +} + + def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: return ( "<|user|>\n" + instruction + "\n<|embed|>\n" if instruction else "<|embed|>\n" @@ -50,9 +62,10 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: similarity_fn_name="cosine", framework=["GritLM", "PyTorch"], use_instructions=True, - training_datasets=E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 + training_datasets=GRIT_LM_TRAINING_DATA, # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data public_training_code="https://github.com/ContextualAI/gritlm", + public_training_data=None, citation=GRITLM_CITATION, ) gritlm8x7b = ModelMeta( @@ -76,8 +89,9 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: similarity_fn_name="cosine", framework=["GritLM", "PyTorch"], use_instructions=True, + training_datasets=GRIT_LM_TRAINING_DATA, citation=GRITLM_CITATION, - training_datasets=E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data public_training_code="https://github.com/ContextualAI/gritlm", + public_training_data=None, ) diff --git a/mteb/models/gte_models.py b/mteb/models/gte_models.py index f80dc01fd..fb3bb6db3 100644 --- a/mteb/models/gte_models.py +++ b/mteb/models/gte_models.py @@ -56,6 +56,7 @@ def instruction_template( use_instructions=True, citation=GTE_CITATION, public_training_code=None, + public_training_data=None, training_datasets=None, max_tokens=131072, ) @@ -87,6 +88,7 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -117,6 +119,7 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -140,6 +143,7 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, # Not disclosed ) @@ -163,6 +167,7 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, # Not disclosed ) @@ -186,6 +191,7 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, # Not disclosed ) @@ -301,6 +307,7 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets=gte_multi_training_data, ) diff --git a/mteb/models/ibm_granite_models.py b/mteb/models/ibm_granite_models.py index 78bad6097..e7c3b8b02 100644 --- a/mteb/models/ibm_granite_models.py +++ b/mteb/models/ibm_granite_models.py @@ -20,6 +20,65 @@ "zho_Hans", ] +granite_training_data = { + # Multilingual MC4 + # Multilingual Webhose + # English Wikipedia + # Multilingual Wikimedia + "WikipediaRetrievalMultilingual": [], + "WikipediaRerankingMultilingual": [], + # 
Miracl Corpus (Title-Body) + # Stack Exchange Duplicate questions (titles) + # Stack Exchange Duplicate questions (titles) + # Stack Exchange Duplicate questions (bodies) + "StackOverflowDupQuestions": [], + "AskUbuntuDupQuestions": [], + # Stack Exchange (Title, Answer) pairs + # Stack Exchange (Title, Body) pairs + # Stack Exchange (Title, Body) pairs + # Machine Translations of Stack Exchange Duplicate questions (titles) + # Machine Translations of Stack Exchange (Title+Body, Answer) pairs + "StackExchangeClusteringP2P": [], + "StackExchangeClusteringP2P.v2": [], + "StackExchangeClustering": [], + "StackExchangeClustering.v2": [], + # SearchQA + # S2ORC (Title, Abstract) + # WikiAnswers Duplicate question pairs + # CCNews + # XSum + # SimpleWiki + # Machine Translated Cross Lingual Parallel Corpora + # SPECTER citation triplets + # Machine Translations of SPECTER citation triplets + # Natural Questions (NQ) + "NQ": ["test"], + "NQHardNegatives": ["test"], + # SQuAD2.0 + # HotpotQA + "HotPotQA": ["test"], + "HotPotQAHardNegatives": ["test"], + "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) + # Fever + "FEVER": ["test"], + "FEVERHardNegatives": ["test"], + # PubMed + # Multilingual Miracl Triples + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + # Multilingual MrTydi Triples + "MrTidyRetrieval": ["train"], + # Sadeeem Question Asnwering + # DBPedia Title-Body Pairs + "DBPedia": ["train"], + # Synthetic: English Query-Wikipedia Passage + # Synthetic: English Fact Verification + # Synthetic: Multilingual Query-Wikipedia Passage + # Synthetic: Multilingual News Summaries + # IBM Internal Triples + # IBM Internal Title-Body Pairs +} granite_107m_multilingual = ModelMeta( loader=partial( # type: ignore @@ -42,8 +101,9 @@ adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, use_instructions=False, - training_datasets=None, + training_datasets=granite_training_data, ) granite_278m_multilingual = ModelMeta( @@ -67,8 +127,9 @@ adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, use_instructions=False, - training_datasets=None, + training_datasets=granite_training_data, ) granite_30m_english = ModelMeta( @@ -92,8 +153,9 @@ adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, use_instructions=False, - training_datasets=None, + training_datasets=granite_training_data, ) granite_125m_english = ModelMeta( @@ -117,6 +179,7 @@ adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, use_instructions=False, - training_datasets=None, + training_datasets=granite_training_data, ) diff --git a/mteb/models/inf_models.py b/mteb/models/inf_models.py index dc31adccd..0d40ff3ef 100644 --- a/mteb/models/inf_models.py +++ b/mteb/models/inf_models.py @@ -26,5 +26,6 @@ use_instructions=True, adapted_from="Alibaba-NLP/gte-Qwen2-7B-instruct", public_training_code=None, + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/jasper_models.py b/mteb/models/jasper_models.py index 1dc06d564..dbd1615ad 100644 --- a/mteb/models/jasper_models.py +++ b/mteb/models/jasper_models.py @@ -93,4 +93,5 @@ def encode( training_datasets=nvidia_training_datasets, # "In jasper model the teacher model is nvidia/NV-Embed-v2", source https://huggingface.co/infgrad/jasper_en_vision_language_v1 # "non_mteb": ["BAAI/Infinity-MM", "HuggingFaceFW/fineweb-edu"], 
     public_training_code=None,
+    public_training_data=None,
 )
diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py
index f9b1f1b72..41742a2ee 100644
--- a/mteb/models/jina_models.py
+++ b/mteb/models/jina_models.py
@@ -222,8 +222,25 @@ def encode(
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
     reference="https://huggingface.co/jinaai/jina-embeddings-v3",
-    training_datasets=None,
     public_training_code=None,
+    public_training_data=None,
+    training_datasets={
+        # CulturaX
+        "STS12": [],
+        # "SICK": [],
+        # "WMT19": [],
+        # "MADLAD-3B": [],
+        # NLI
+        "MSMARCO": ["train"],
+        "MSMARCOHardNegatives": ["train"],
+        "NanoMSMARCORetrieval": ["train"],
+        "NQ": ["train"],
+        "NQHardNegatives": ["train"],
+        "NanoNQRetrieval": ["train"],
+        "NQ-PL": ["train"],  # translation not trained on
+        # oasst1, oasst2
+    },
+    adapted_from="XLM-RoBERTa",
     citation="""
@misc{sturua2024jinaembeddingsv3multilingualembeddingstask,
    title={jina-embeddings-v3: Multilingual Embeddings With Task LoRA},
@@ -256,6 +273,7 @@
     adapted_from=None,
     training_datasets=None,
     public_training_code=None,
+    public_training_data=None,
 )

 jina_embeddings_v2_small_en = ModelMeta(
@@ -276,6 +294,7 @@
     adapted_from=None,
     training_datasets=None,
     public_training_code=None,
+    public_training_data=None,
 )

 jina_embedding_b_en_v1 = ModelMeta(
@@ -296,6 +315,7 @@
     adapted_from=None,
     training_datasets=None,
     public_training_code=None,
+    public_training_data=None,
 )

 jina_embedding_s_en_v1 = ModelMeta(
@@ -316,4 +336,5 @@
     adapted_from=None,
     training_datasets=None,
     public_training_code=None,
+    public_training_data=None,
 )
diff --git a/mteb/models/lens_models.py b/mteb/models/lens_models.py
new file mode 100644
index 000000000..380724e53
--- /dev/null
+++ b/mteb/models/lens_models.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+from mteb.model_meta import ModelMeta
+
+from .bge_models import bge_full_data
+
+lens_d4000 = ModelMeta(
+    loader=None,  # TODO: implement this in the future
+    name="yibinlei/LENS-d4000",
+    languages=None,
+    open_weights=True,
+    revision="e473b33364e6c48a324796fd1411d3b93670c6fe",
+    release_date="2025-01-17",
+    n_parameters=int(7.11 * 1e9),
+    embed_dim=4000,
+    license="apache-2.0",
+    reference="https://huggingface.co/yibinlei/LENS-d4000",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    use_instructions=True,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
+    training_datasets=bge_full_data,
+    max_tokens=32768,
+)
+
+lens_d8000 = ModelMeta(
+    loader=None,  # TODO: implement this in the future
+    name="yibinlei/LENS-d8000",
+    languages=None,
+    open_weights=True,
+    revision="a0b87bd91cb27b6f2f0b0fe22c28026da1d464ef",
+    release_date="2025-01-17",
+    n_parameters=int(7.11 * 1e9),
+    embed_dim=8000,
+    license="apache-2.0",
+    reference="https://huggingface.co/yibinlei/LENS-d8000",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    use_instructions=True,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
+    training_datasets=bge_full_data,
+    max_tokens=32768,
+)
diff --git a/mteb/models/linq_models.py b/mteb/models/linq_models.py
index 11cfa74ed..ead10ebf7 100644
--- a/mteb/models/linq_models.py
+++ b/mteb/models/linq_models.py
@@ -40,5 +40,6 @@ def instruction_template(
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
     public_training_code=None,
+    public_training_data=None,
     training_datasets=None,
 )
diff --git
a/mteb/models/llm2vec_models.py b/mteb/models/llm2vec_models.py index 708353475..82186309d 100644 --- a/mteb/models/llm2vec_models.py +++ b/mteb/models/llm2vec_models.py @@ -138,6 +138,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, + public_training_data=None, citation=LLM2VEC_CITATION, ) @@ -165,6 +166,7 @@ def loader_inner(**kwargs: Any) -> Encoder: citation=LLM2VEC_CITATION, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, + public_training_data=None, ) @@ -192,6 +194,7 @@ def loader_inner(**kwargs: Any) -> Encoder: citation=LLM2VEC_CITATION, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, + public_training_data=None, ) llm2vec_mistral7b_unsupervised = ModelMeta( @@ -218,6 +221,7 @@ def loader_inner(**kwargs: Any) -> Encoder: citation=LLM2VEC_CITATION, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, + public_training_data=None, ) llm2vec_llama2_7b_supervised = ModelMeta( @@ -244,6 +248,7 @@ def loader_inner(**kwargs: Any) -> Encoder: citation=LLM2VEC_CITATION, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, + public_training_data=None, ) llm2vec_llama2_7b_unsupervised = ModelMeta( @@ -269,6 +274,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, + public_training_data=None, citation=LLM2VEC_CITATION, ) @@ -296,6 +302,7 @@ def loader_inner(**kwargs: Any) -> Encoder: citation=LLM2VEC_CITATION, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, + public_training_data=None, ) llm2vec_sheared_llama_unsupervised = ModelMeta( @@ -322,4 +329,5 @@ def loader_inner(**kwargs: Any) -> Encoder: citation=LLM2VEC_CITATION, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, + public_training_data=None, ) diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index 5233ecec6..140d8bac7 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -7,7 +7,7 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader from mteb.models.e5_models import E5_TRAINING_DATA -from .bge_models import bge_m_training_data, bge_training_data +from .bge_models import bge_m3_training_data, bge_training_data from .sentence_transformers_models import sent_trf_training_dataset Haon_Chen__speed_embedding_7b_instruct = ModelMeta( @@ -22,6 +22,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/Haon-Chen/speed-embedding-7b-instruct", similarity_fn_name="cosine", @@ -42,6 +43,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, 
framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Gameselo/STS-multilingual-mpnet-base-v2", similarity_fn_name="cosine", @@ -62,6 +64,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", similarity_fn_name="cosine", @@ -82,6 +85,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-v1", similarity_fn_name="cosine", @@ -102,6 +106,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/Hum-Works/lodestone-base-4096-v1", similarity_fn_name="cosine", @@ -164,6 +169,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Jaume/gemma-2b-embeddings", similarity_fn_name="cosine", @@ -184,6 +190,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/BeastyZ/e5-R-mistral-7b", similarity_fn_name="cosine", @@ -210,6 +217,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Lajavaness/bilingual-embedding-base", similarity_fn_name="cosine", @@ -235,6 +243,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Lajavaness/bilingual-embedding-large", similarity_fn_name="cosine", @@ -260,6 +269,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Lajavaness/bilingual-embedding-small", similarity_fn_name="cosine", @@ -280,6 +290,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Bulbasaur", similarity_fn_name="cosine", @@ -301,6 +312,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Ivysaur", similarity_fn_name="cosine", @@ -322,6 +334,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Squirtle", similarity_fn_name="cosine", @@ -343,6 +356,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Venusaur", similarity_fn_name="cosine", @@ -364,6 +378,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Wartortle", similarity_fn_name="cosine", @@ -385,6 +400,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], 
reference="https://huggingface.co/Mihaiii/gte-micro", similarity_fn_name="cosine", @@ -405,6 +421,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/gte-micro-v4", similarity_fn_name="cosine", @@ -425,6 +442,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/OrdalieTech/Solon-embeddings-large-0.1", similarity_fn_name="cosine", @@ -445,6 +463,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", @@ -465,6 +484,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet", similarity_fn_name="cosine", @@ -487,6 +507,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", @@ -509,6 +530,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-labse-Matryoshka", similarity_fn_name="cosine", @@ -531,6 +553,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet", similarity_fn_name="cosine", @@ -553,6 +576,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", @@ -573,6 +597,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/consciousAI/cai-lunaris-text-embeddings", similarity_fn_name="cosine", @@ -593,6 +618,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/consciousAI/cai-stellaris-text-embeddings", similarity_fn_name="cosine", @@ -613,6 +639,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/bge-m3-custom-fr", similarity_fn_name="cosine", @@ -633,6 +660,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.2", similarity_fn_name="cosine", @@ -653,6 +681,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.3", 
similarity_fn_name="cosine", @@ -673,6 +702,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.4", similarity_fn_name="cosine", @@ -694,6 +724,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/thenlper/gte-base", similarity_fn_name="cosine", @@ -714,6 +745,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/thenlper/gte-large", similarity_fn_name="cosine", @@ -734,6 +766,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/thenlper/gte-small", similarity_fn_name="cosine", @@ -754,6 +787,7 @@ license="gpl-3.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/OrlikB/KartonBERT-USE-base-v1", similarity_fn_name="cosine", @@ -774,6 +808,7 @@ license="lgpl", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/OrlikB/st-polish-kartonberta-base-alpha-v1", similarity_fn_name="cosine", @@ -794,6 +829,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-e5-base", similarity_fn_name="cosine", @@ -814,6 +850,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/dwzhu/e5-base-4k", similarity_fn_name="cosine", @@ -834,6 +871,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-e5-large", similarity_fn_name="cosine", @@ -854,6 +892,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-e5-small", similarity_fn_name="cosine", @@ -874,6 +913,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-roberta-base", similarity_fn_name="cosine", @@ -894,6 +934,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-roberta-large", similarity_fn_name="cosine", @@ -960,6 +1001,7 @@ license="bigscience-bloom-rail-1.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-1b1", similarity_fn_name="cosine", @@ -1026,6 +1068,7 @@ license="bigscience-bloom-rail-1.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-3b", similarity_fn_name="cosine", @@ -1092,6 +1135,7 @@ license="bigscience-bloom-rail-1.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-560m", similarity_fn_name="cosine", @@ -1158,6 +1202,7 @@ license="bigscience-bloom-rail-1.0", 
open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-7b1", similarity_fn_name="cosine", @@ -1178,6 +1223,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-Embedding-v0", similarity_fn_name="cosine", @@ -1198,6 +1244,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-all-MiniLM-L6-v2", similarity_fn_name="cosine", @@ -1218,6 +1265,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-large-Embedding-v0", similarity_fn_name="cosine", @@ -1238,6 +1286,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-small-Embedding-v0", similarity_fn_name="cosine", @@ -1258,6 +1307,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/bigscience/sgpt-bloom-7b1-msmarco", similarity_fn_name="cosine", @@ -1278,6 +1328,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/aari1995/German_Semantic_STS_V2", similarity_fn_name="cosine", @@ -1299,6 +1350,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/abhinand/MedEmbed-small-v0.1", similarity_fn_name="cosine", @@ -1325,6 +1377,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/NoInstruct-small-Embedding-v0", similarity_fn_name="cosine", @@ -1345,6 +1398,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/brahmairesearch/slx-v0.1", similarity_fn_name="cosine", @@ -1365,6 +1419,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/deepfile/embedder-100p", similarity_fn_name="cosine", @@ -1385,11 +1440,12 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/deepvk/USER-bge-m3", similarity_fn_name="cosine", use_instructions=None, - training_datasets=bge_m_training_data, # derived from. + training_datasets=bge_m3_training_data, # derived from. 
# not in MTEB: # "deepvk/ru-HNP": ["train"], # "deepvk/ru-WANLI": ["train"], @@ -1416,6 +1472,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/infgrad/stella-base-en-v2", similarity_fn_name="cosine", @@ -1436,6 +1493,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/malenia1/ternary-weight-embedding", similarity_fn_name="cosine", @@ -1456,6 +1514,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/omarelshehy/arabic-english-sts-matryoshka", similarity_fn_name="cosine", @@ -1486,6 +1545,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/openbmb/MiniCPM-Embedding", similarity_fn_name="cosine", @@ -1516,6 +1576,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/shibing624/text2vec-base-multilingual", similarity_fn_name="cosine", @@ -1537,6 +1598,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/silma-ai/silma-embeddding-matryoshka-v0.1", similarity_fn_name="cosine", @@ -1545,26 +1607,7 @@ adapted_from="/workspace/v3-matryoshka_aubmindlab-bert-base-arabertv02-2024-10-12_13-55-06/checkpoint-26250", superseded_by=None, ) -zeta_alpha_ai__Zeta_Alpha_E5_Mistral = ModelMeta( - name="zeta-alpha-ai/Zeta-Alpha-E5-Mistral", - revision="3e6076bdc2ff592a2f95fbc04570e51db5aa0c0c", - release_date="2024-08-30", - languages=["eng_Latn"], - loader=None, - n_parameters=7110660096, - max_tokens=32768.0, - embed_dim=4096, - license="mit", - open_weights=True, - public_training_code=None, - framework=["PyTorch"], - reference="https://huggingface.co/zeta-alpha-ai/Zeta-Alpha-E5-Mistral", - similarity_fn_name="cosine", - use_instructions=None, - training_datasets=None, - adapted_from="intfloat/e5-mistral-7b-instruct", - superseded_by=None, -) + sbert_chinese_general_v1 = ModelMeta( name="DMetaSoul/sbert-chinese-general-v1", revision="bd27765956bcc2fcf682de0097819947ac10037e", @@ -1577,6 +1620,7 @@ license="apache-2", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/DMetaSoul/sbert-chinese-general-v1", similarity_fn_name="cosine", @@ -1601,6 +1645,7 @@ license="apache-2", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/DMetaSoul/Dmeta-embedding-zh-small/", similarity_fn_name="cosine", @@ -1620,6 +1665,7 @@ license="not specified", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/lier007/xiaobu-embedding", similarity_fn_name="cosine", @@ -1640,6 +1686,7 @@ license="not specified", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/lier007/xiaobu-embedding-v2", 
similarity_fn_name="cosine", @@ -1660,6 +1707,7 @@ license="not specified", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Classical/Yinka", similarity_fn_name="cosine", @@ -1680,6 +1728,7 @@ license="cc-by-nc-4.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Classical/Yinka", similarity_fn_name="cosine", @@ -1688,3 +1737,41 @@ training_datasets=None, # They "scraped" things from the internet, we don't know, could be leakage superseded_by=None, ) +ember_v1 = ModelMeta( + name="llmrails/ember-v1", + revision="5e5ce5904901f6ce1c353a95020f17f09e5d021d", + release_date="2023-10-10", + languages=["eng_Latn"], + n_parameters=335 * 1e6, + max_tokens=512, + embed_dim=1024, + license="mit", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/llmrails/ember-v1", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + superseded_by=None, +) +amazon_titan_text_embeddings_v2 = ModelMeta( + name="amazon/Titan-text-embeddings-v2", + revision="1", + release_date="2024-04-30", + languages=["eng_Latn"], + n_parameters=None, + max_tokens=None, + embed_dim=None, + license="proprietary", + open_weights=False, + public_training_code=None, + public_training_data=None, + framework=[], + reference="https://huggingface.co/amazon/Titan-text-embeddings-v2", + similarity_fn_name="cosine", + use_instructions=False, + training_datasets=None, + superseded_by=None, +) diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index afbf9df62..33da211c7 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -75,7 +75,8 @@ def encode( adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, training_datasets=bge_training_data, # distilled - public_training_code="https://github.com/MinishLab/model2vec", # + public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) @@ -101,6 +102,7 @@ def encode( superseded_by=None, training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) m2v_base_output = ModelMeta( @@ -125,6 +127,7 @@ def encode( superseded_by=None, training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) m2v_multilingual_output = ModelMeta( @@ -149,6 +152,7 @@ def encode( superseded_by=None, training_datasets=None, public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) potion_base_2m = ModelMeta( @@ -173,6 +177,7 @@ def encode( superseded_by=None, training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) potion_base_4m = ModelMeta( @@ -197,6 +202,7 @@ def encode( superseded_by=None, training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) potion_base_8m = ModelMeta( @@ -221,4 +227,5 @@ def encode( superseded_by=None, training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) diff --git a/mteb/models/moka_models.py 
b/mteb/models/moka_models.py index d3943d78d..1504b4078 100644 --- a/mteb/models/moka_models.py +++ b/mteb/models/moka_models.py @@ -96,7 +96,8 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=None, # Not published + public_training_code=None, + public_training_data=None, # Not published training_datasets=m3e_dataset, ) @@ -117,7 +118,8 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=None, # Not published + public_training_code=None, + public_training_data=None, # Not published training_datasets=m3e_dataset, ) @@ -139,6 +141,7 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=None, # Not published + public_training_code=None, + public_training_data=None, # Not published training_datasets=m3e_dataset, ) diff --git a/mteb/models/mxbai_models.py b/mteb/models/mxbai_models.py index e0be5c9d9..c4bc7c3db 100644 --- a/mteb/models/mxbai_models.py +++ b/mteb/models/mxbai_models.py @@ -42,5 +42,6 @@ } """, public_training_code=None, + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/no_instruct_sentence_models.py b/mteb/models/no_instruct_sentence_models.py index a0596b9bd..9ff5cf901 100644 --- a/mteb/models/no_instruct_sentence_models.py +++ b/mteb/models/no_instruct_sentence_models.py @@ -100,5 +100,6 @@ def encode( # type: ignore adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/nomic_models.py b/mteb/models/nomic_models.py index b2b054254..c2d06e2f6 100644 --- a/mteb/models/nomic_models.py +++ b/mteb/models/nomic_models.py @@ -90,6 +90,79 @@ def encode( # type: ignore return emb +nomic_training_data = { + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/data/contrastive_pretrain.yaml + # reddit_title_body + "RedditClustering": [], + "RedditClusteringP2P": [], + "RedditClustering.v2": [], + "RedditClusteringP2P.v2": [], + # amazon_reviews + # amazonqa + "AmazonPolarityClassification": [], + "AmazonReviewsClassification": [], + "AmazonCounterfactualClassification": [], + # paq + # s2orc_citation_titles + # s2orc_title_abstract + # s2orc_abstract_citation + # s2orc_abstract_body + # wikianswers + # wikipedia + "WikipediaRetrievalMultilingual": [], + "WikipediaRerankingMultilingual": [], + # gooaq + # codesearch + "CodeSearchNetCCRetrieval": [], + "COIRCodeSearchNetRetrieval": [], + # yahoo_title_answer + # yahoo_qa + # yahoo_title_question + "YahooAnswersTopicsClassification": [], + # agnews + # ccnews + # npr + # eli5 + # cnn + # stackexchange_duplicate_questions + # stackexchange_title_body + # stackexchange_body_body + "StackExchangeClustering.v2": [], + "StackExchangeClusteringP2P.v2": [], + # sentence_compression + # wikihow + # altlex + # quora + "QuoraRetrieval": [], + "NanoQuoraRetrieval": [], + # simplewiki + # squad + "FQuADRetrieval": [], + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/data/finetune_triplets.yaml + # msmaro + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + # nq_triples + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # nli_triplets + # reddit + # medi_wiki + # medi_stackexchange + # medi_flickr + # medi_supernli + # hotpot + "HotPotQA": ["test"], + 
"HotPotQAHardNegatives": ["test"], + "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) + # fever + "FEVER": ["test"], + "FEVERHardNegatives": ["test"], +} + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/eval/mteb_eval/eval_mteb.py#L142-L159 model_prompts = { "Classification": "classification: ", @@ -138,8 +211,9 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, - public_training_code=None, - training_datasets=None, + public_training_data=None, + public_training_code="https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune.yaml", + training_datasets=nomic_training_data, ) nomic_embed_v1 = ModelMeta( @@ -166,8 +240,9 @@ def encode( # type: ignore citation=NOMIC_CITATION, adapted_from=None, superseded_by="nomic-ai/nomic-embed-text-v1.5", - public_training_code=None, - training_datasets=None, + public_training_code="https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune.yaml", + training_datasets=nomic_training_data, + public_training_data=None, ) nomic_embed_v1_ablated = ModelMeta( @@ -193,8 +268,9 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, - public_training_code=None, - training_datasets=None, + public_training_code="https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune.yaml", + training_datasets=nomic_training_data, + public_training_data=None, ) @@ -221,8 +297,9 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, - public_training_code=None, - training_datasets=None, + public_training_code="https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune.yaml", + training_datasets=nomic_training_data, + public_training_data=None, ) nomic_modern_bert_embed = ModelMeta( @@ -248,8 +325,10 @@ def encode( # type: ignore similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - adapted_from=None, + adapted_from="answerdotai/ModernBERT-base", + public_training_code="https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_pretrain_modernbert.yaml", + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune_modernnomic.yaml superseded_by=None, - public_training_code=None, - training_datasets=None, + training_datasets=nomic_training_data, + public_training_data=None, ) diff --git a/mteb/models/nvidia_models.py b/mteb/models/nvidia_models.py index 1f345a62b..1997a8527 100644 --- a/mteb/models/nvidia_models.py +++ b/mteb/models/nvidia_models.py @@ -141,6 +141,7 @@ def encode( use_instructions=True, training_datasets=nvidia_training_datasets, public_training_code=None, + public_training_data=None, ) NV_embed_v1 = ModelMeta( @@ -164,4 +165,5 @@ def encode( use_instructions=True, training_datasets=nvidia_training_datasets, public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py index 863c9d782..079e7c936 100644 --- a/mteb/models/openai_models.py +++ b/mteb/models/openai_models.py @@ -135,7 +135,8 @@ def _to_numpy(self, 
embedding_response) -> np.ndarray: similarity_fn_name="cosine", framework=["API"], use_instructions=False, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) text_embedding_3_large = ModelMeta( @@ -156,7 +157,8 @@ def _to_numpy(self, embedding_response) -> np.ndarray: framework=["API"], use_instructions=False, n_parameters=None, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, license=None, similarity_fn_name=None, @@ -179,7 +181,8 @@ def _to_numpy(self, embedding_response) -> np.ndarray: framework=["API"], use_instructions=False, n_parameters=None, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, license=None, similarity_fn_name=None, diff --git a/mteb/models/overview.py b/mteb/models/overview.py index c72fe2ed8..e23285ff6 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -15,10 +15,12 @@ arctic_models, bge_models, bm25, + cde_models, cohere_models, colbert_models, e5_instruct, e5_models, + gme_models, google_models, gritlm_models, gte_models, @@ -26,6 +28,7 @@ inf_models, jasper_models, jina_models, + lens_models, linq_models, llm2vec_models, misc_models, @@ -56,6 +59,7 @@ arctic_models, bge_models, bm25, + cde_models, cohere_models, colbert_models, e5_instruct, @@ -64,9 +68,11 @@ google_models, gritlm_models, gte_models, + gme_models, ibm_granite_models, inf_models, jina_models, + lens_models, linq_models, llm2vec_models, mxbai_models, @@ -210,6 +216,25 @@ def get_model_meta(model_name: str, revision: str | None = None) -> ModelMeta: return meta +empty_model_meta = ModelMeta( + name=None, + revision=None, + languages=None, + release_date=None, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + open_weights=True, + public_training_code=None, + public_training_data=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=[], +) + + @lru_cache def model_meta_from_hf_hub(model_name: str) -> ModelMeta: try: @@ -234,26 +259,14 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: embed_dim=None, open_weights=True, public_training_code=None, + public_training_data=None, use_instructions=None, ) except Exception as e: logger.warning(f"Failed to extract metadata from model: {e}.") - return ModelMeta( - name=model_name, - revision=None, - languages=None, - release_date=None, - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=True, - public_training_code=None, - similarity_fn_name=None, - use_instructions=None, - training_datasets=None, - framework=[], - ) + meta = empty_model_meta + meta.name = model_name + return meta def model_meta_from_cross_encoder(model: CrossEncoder) -> ModelMeta: @@ -273,6 +286,7 @@ def model_meta_from_cross_encoder(model: CrossEncoder) -> ModelMeta: license=None, open_weights=True, public_training_code=None, + public_training_data=None, use_instructions=None, training_datasets=None, ) @@ -280,22 +294,7 @@ def model_meta_from_cross_encoder(model: CrossEncoder) -> ModelMeta: logger.warning( f"Failed to extract metadata from model: {e}. Upgrading to sentence-transformers v3.0.0 or above is recommended." 
) - meta = ModelMeta( - name=None, - revision=None, - languages=None, - release_date=None, - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=True, - public_training_code=None, - similarity_fn_name=None, - use_instructions=None, - training_datasets=None, - framework=[], - ) + meta = empty_model_meta return meta @@ -325,6 +324,7 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe license=None, open_weights=True, public_training_code=None, + public_training_data=None, use_instructions=None, training_datasets=None, ) @@ -332,20 +332,5 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe logger.warning( f"Failed to extract metadata from model: {e}. Upgrading to sentence-transformers v3.0.0 or above is recommended." ) - meta = ModelMeta( - name=None, - revision=None, - languages=None, - release_date=None, - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=True, - public_training_code=None, - similarity_fn_name=None, - use_instructions=None, - training_datasets=None, - framework=[], - ) + meta = empty_model_meta return meta diff --git a/mteb/models/piccolo_models.py b/mteb/models/piccolo_models.py index bb92b5567..d51487b8b 100644 --- a/mteb/models/piccolo_models.py +++ b/mteb/models/piccolo_models.py @@ -21,6 +21,7 @@ superseded_by=None, adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=None, # They don't specify ) @@ -42,5 +43,6 @@ superseded_by=None, adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=None, # They don't say ) diff --git a/mteb/models/promptriever_models.py b/mteb/models/promptriever_models.py index 7dc98a26a..df2204def 100644 --- a/mteb/models/promptriever_models.py +++ b/mteb/models/promptriever_models.py @@ -80,6 +80,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, citation=PROMPTRIEVER_CITATION, public_training_code=None, + public_training_data=None, ) promptriever_llama3 = ModelMeta( @@ -107,6 +108,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, citation=PROMPTRIEVER_CITATION, public_training_code=None, + public_training_data=None, ) @@ -135,6 +137,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, citation=PROMPTRIEVER_CITATION, public_training_code=None, + public_training_data=None, ) promptriever_mistral_v1 = ModelMeta( @@ -162,4 +165,5 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, citation=PROMPTRIEVER_CITATION, public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/repllama_models.py b/mteb/models/repllama_models.py index 8faa2c490..ffe1f0bd8 100644 --- a/mteb/models/repllama_models.py +++ b/mteb/models/repllama_models.py @@ -172,6 +172,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, citation=REPLLAMA_CITATION, public_training_code=None, + public_training_data=None, ) @@ -199,5 +200,6 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, citation=REPLLAMA_CITATION, public_training_code=None, + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/rerankers_custom.py b/mteb/models/rerankers_custom.py index bedfd0960..34adea7ff 100644 --- a/mteb/models/rerankers_custom.py +++ b/mteb/models/rerankers_custom.py @@ -11,6 +11,7 @@ from mteb.encoder_interface import Encoder from mteb.evaluation.evaluators.RetrievalEvaluator import DenseRetrievalExactSearch from 
mteb.model_meta import ModelMeta +from mteb.models.bge_models import bge_m3_training_data logger = logging.getLogger(__name__) @@ -209,6 +210,7 @@ def loader_inner(**kwargs: Any) -> Encoder: embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -233,6 +235,7 @@ def loader_inner(**kwargs: Any) -> Encoder: embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -289,9 +292,10 @@ def loader_inner(**kwargs: Any) -> Encoder: embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, - training_datasets=None, + training_datasets=bge_m3_training_data, framework=["Sentence Transformers", "PyTorch"], citation=""" @misc{li2023making, diff --git a/mteb/models/rerankers_monot5_based.py b/mteb/models/rerankers_monot5_based.py index f45addb18..320ee4bc7 100644 --- a/mteb/models/rerankers_monot5_based.py +++ b/mteb/models/rerankers_monot5_based.py @@ -301,6 +301,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -342,6 +343,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -365,6 +367,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -397,6 +400,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -452,6 +456,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -497,6 +502,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -542,6 +548,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -587,6 +594,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -610,6 +618,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -651,6 +660,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -674,6 +684,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + 
public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -707,6 +718,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -854,6 +866,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -876,6 +889,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index 1869ce62d..683c8c502 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -6,42 +6,53 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader -from .bge_models import bge_training_data +from .bge_models import bge_m3_training_data -rubert_tiny2 = ModelMeta( - name="cointegrated/rubert-tiny2", +rubert_tiny = ModelMeta( + name="cointegrated/rubert-tiny", languages=["rus_Cyrl"], open_weights=True, - revision="dad72b8f77c5eef6995dd3e4691b758ba56b90c3", - release_date="2021-10-28", + revision="5441c5ea8026d4f6d7505ec004845409f1259fb1", + release_date="2021-05-24", n_parameters=29_400_000, embed_dim=312, license="mit", max_tokens=2048, - reference="https://huggingface.co/cointegrated/rubert-tiny2", + reference="https://huggingface.co/cointegrated/rubert-tiny", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - public_training_code=None, - training_datasets=None, + public_training_code="https://gist.github.com/avidale/7bc6350f26196918bf339c01261f5c60", + training_datasets={ + # [Yandex Translate corpus](https://translate.yandex.ru/corpus), [OPUS-100](https://huggingface.co/datasets/opus100) + "Tatoeba": ["train"], + }, + adapted_from="google-bert/bert-base-multilingual-cased", + public_training_data=None, ) -rubert_tiny = ModelMeta( - name="cointegrated/rubert-tiny", +rubert_tiny2 = ModelMeta( + name="cointegrated/rubert-tiny2", languages=["rus_Cyrl"], open_weights=True, - revision="5441c5ea8026d4f6d7505ec004845409f1259fb1", - release_date="2021-05-24", + revision="dad72b8f77c5eef6995dd3e4691b758ba56b90c3", + release_date="2021-10-28", n_parameters=29_400_000, embed_dim=312, license="mit", max_tokens=2048, - reference="https://huggingface.co/cointegrated/rubert-tiny", + reference="https://huggingface.co/cointegrated/rubert-tiny2", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - public_training_code=None, - training_datasets=None, + public_training_code="https://colab.research.google.com/drive/1mSWfIQ6PIlteLVZ9DKKpcorycgLIKZLf?usp=sharing", + training_datasets={ + # https://huggingface.co/datasets/cointegrated/ru-paraphrase-NMT-Leipzig + # Wikipedia https://huggingface.co/datasets/Madjogger/JamSpell_dataset + # https://huggingface.co/datasets/imvladikon/leipzig_corpora_collection + }, + adapted_from="cointegrated/rubert-tiny", + public_training_data=None, ) sbert_large_nlu_ru = ModelMeta( @@ -59,6 +70,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -77,7 +89,11 @@ 
framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, - training_datasets=None, + public_training_data=None, + training_datasets={ + # SNLI, MNLI + # https://github.com/brmson/dataset-sts + }, ) user_base_ru = ModelMeta( @@ -93,12 +109,13 @@ revision="436a489a2087d61aa670b3496a9915f84e46c861", release_date="2024-06-10", n_parameters=427_000_000, - embed_dim=1024, - license="Not specified", - max_tokens=512, # best guess - reference="https://huggingface.co/ai-forever/sbert_large_mt_nlu_ru", + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/deepvk/USER-base", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], + adapted_from="https://huggingface.co/deepvk/deberta-v1-base", use_instructions=True, citation="""@misc{deepvk2024user, title={USER: Universal Sentence Encoder for Russian}, @@ -108,13 +125,68 @@ year={2024}, } """, + training_datasets={ + "BibleNLPBitextMining": ["train"], + # https://github.com/unicamp-dl/mMARCO + # deepvk/ru-HNP + # deepvk/ru-WANLI + # MedNLI + # RCB + "TERRa": ["train"], + # Tapaco + # Opus100 + # BiblePar + # RudetoxifierDataDetox + # RuParadetox + "MIRACL": ["train"], + # MLDR + # Lenta + "MLSUMClusteringP2P": ["train"], + "MLSUMClusteringP2P.v2": ["train"], + "MLSUMClusteringS2S": ["train"], + "MLSUMClusteringS2S.v2": ["train"], + "MrTidyRetrieval": ["train"], + # "Panorama" + # PravoIsrael + # xlsum + # Fialka-v1 + # RussianKeywords + # Gazeta + # Gsm8k-ru + # DSumRu + # SummDialogNews + }, + public_training_code=None, + public_training_data=None, +) + +user_bge_m3 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="deepvk/USER-bge-m3", + revision="0cc6cfe48e260fb0474c753087a69369e88709ae", + ), + name="deepvk/USER-bge-m3", + languages=["rus_Cyrl"], + open_weights=True, + revision="0cc6cfe48e260fb0474c753087a69369e88709ae", + release_date="2024-07-05", + n_parameters=359_026_688, + embed_dim=1024, + license="apache-2.0", + max_tokens=8194, + reference="https://huggingface.co/deepvk/USER-base", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + adapted_from="https://huggingface.co/BAAI/bge-m3", + use_instructions=False, training_datasets={ "BibleNLPBitextMining": ["train"], "MLSUMClusteringP2P": ["train"], "MLSUMClusteringP2P.v2": ["train"], "MLSUMClusteringS2S": ["train"], "MLSUMClusteringS2S.v2": ["train"], - **bge_training_data, + **bge_m3_training_data, # not MTEB: # "deepvk/ru-HNP": ["train"], # "deepvk/ru-WANLI": ["train"], @@ -132,6 +204,7 @@ # "CarlBrendt/Summ_Dialog_News": ["train"], }, public_training_code=None, + public_training_data=None, ) deberta_v1_ru = ModelMeta( @@ -148,7 +221,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + # Wikipedia, Books, Twitter comments, Pikabu, Proza.ru, Film subtitles, News websites, and Social corpus public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -161,12 +236,13 @@ n_parameters=1280_000_000, embed_dim=768, license="Not specified", - max_tokens=512, # best guess + max_tokens=512, reference="https://huggingface.co/DeepPavlov/rubert-base-cased", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, citation="""@misc{kuratov2019adaptationdeepbidirectionalmultilingual, title={Adaptation of Deep Bidirectional 
Multilingual Transformers for Russian Language}, @@ -194,6 +270,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, citation="""@misc{https://doi.org/10.48550/arxiv.2205.02340, doi = {10.48550/ARXIV.2205.02340}, @@ -222,7 +299,11 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, - training_datasets=None, + public_training_data=None, + training_datasets={ + # "SNLI": [], + "XNLI": ["dev"] + }, ) labse_en_ru = ModelMeta( @@ -239,8 +320,10 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - public_training_code=None, + public_training_code="https://colab.research.google.com/drive/1dnPRn0-ugj3vZgSpyCC9sgslM2SuSfHy?usp=sharing", + public_training_data=None, training_datasets=None, + adapted_from="sentence-transformers/LaBSE", ) rubert_tiny_turbo = ModelMeta( @@ -258,8 +341,10 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, # source model in unknown # Not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, + adapted_from="cointegrated/rubert-tiny2", ) labse_ru_turbo = ModelMeta( @@ -276,9 +361,11 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - training_datasets=None, # source model in unknown + training_datasets=None, # not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, public_training_code=None, + adapted_from="cointegrated/LaBSE-en-ru", + public_training_data=None, ) @@ -305,8 +392,24 @@ embed_dim=1024, license="mit", similarity_fn_name="cosine", + adapted_from="ai-forever/ruRoberta-large", + training_datasets={ + # https://huggingface.co/ai-forever/ruRoberta-large + # https://huggingface.co/datasets/IlyaGusev/yandex_q_full + # https://huggingface.co/datasets/IlyaGusev/pikabu + # https://huggingface.co/datasets/IlyaGusev/ru_stackoverflow + # https://huggingface.co/datasets/IlyaGusev/habr + # https://huggingface.co/datasets/its5Q/habr_qna + # NewsCommentary + # MultiParaCrawl + "XNLI": [], + "XNLIV2": [], + "LanguageClassification": [], # XNLI + "MIRACLReranking": ["train"], + "MIRACLRetrieval": ["train"], + }, + public_training_data=None, public_training_code=None, - training_datasets=None, framework=["Sentence Transformers", "PyTorch"], citation="""@misc{snegirev2024russianfocusedembeddersexplorationrumteb, title={The Russian-focused embedders' exploration: ruMTEB benchmark and Russian embedding model design}, diff --git a/mteb/models/salesforce_models.py b/mteb/models/salesforce_models.py index fd5487166..c5ba79933 100644 --- a/mteb/models/salesforce_models.py +++ b/mteb/models/salesforce_models.py @@ -6,6 +6,8 @@ from mteb.model_meta import ModelMeta from mteb.models.instruct_wrapper import instruct_wrapper +from .e5_instruct import E5_MISTRAL_TRAINING_DATA + def instruction_template( instruction: str, prompt_type: PromptType | None = None @@ -13,6 +15,19 @@ def instruction_template( return f"Instruct: {instruction}\nQuery: " if instruction else "" +SFR_TRAINING_DATA = { # inherits from e5 + **E5_MISTRAL_TRAINING_DATA, + # From previously released blogpost which now have been taken down: + "FiQA2018": ["train"], + "FiQA2018-PL": ["train"], + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "FEVER-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + 
"HotpotQAHardNegatives": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on +} + SFR_Embedding_2_R = ModelMeta( loader=partial( # type: ignore instruct_wrapper, @@ -41,16 +56,8 @@ def instruction_template( use_instructions=True, adapted_from="intfloat/e5-mistral-7b-instruct", public_training_code=None, - training_datasets={ # inherits from e5 - "MSMARCO": ["train"], - "MSMARCOHardNegatives": ["train"], - "NanoMSMARCORetrieval": ["train"], - "MSMARCO-PL": ["train"], # translation not trained on - "NQ": ["train"], - "NQHardNegatives": ["train"], - "NanoNQRetrieval": ["train"], - "NQ-PL": ["train"], # translation not trained on - }, + public_training_data=None, + training_datasets=SFR_TRAINING_DATA, citation="""@misc{SFR-embedding-2, title={SFR-Embedding-2: Advanced Text Embedding with Multi-stage Training}, author={Rui Meng*, Ye Liu*, Shafiq Rayhan Joty, Caiming Xiong, Yingbo Zhou, Semih Yavuz}, @@ -86,14 +93,6 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, - training_datasets={ # inherits from e5 - "MSMARCO": ["train"], - "MSMARCOHardNegatives": ["train"], - "NanoMSMARCORetrieval": ["train"], - "MSMARCO-PL": ["train"], # translation not trained on - "NQ": ["train"], - "NQHardNegatives": ["train"], - "NanoNQRetrieval": ["train"], - "NQ-PL": ["train"], # translation not trained on - }, + public_training_data=None, + training_datasets=SFR_TRAINING_DATA, ) diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index 63be6e925..73dcf8a66 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -129,6 +129,7 @@ adapted_from=None, training_datasets=sent_trf_training_dataset, public_training_code=None, + public_training_data=None, citation=SBERT_CITATION, ) @@ -151,6 +152,7 @@ training_datasets=sent_trf_training_dataset, public_training_code=None, citation=SBERT_CITATION, + public_training_data=None, ) paraphrase_multilingual_MiniLM_L12_v2 = ModelMeta( @@ -172,6 +174,7 @@ training_datasets=sent_trf_training_dataset, # assumed (probably some parallel as well) public_training_code=None, citation=SBERT_CITATION, + public_training_data=None, ) paraphrase_multilingual_mpnet_base_v2 = ModelMeta( @@ -204,6 +207,7 @@ # "yahoo-answers": yahoo_answers_train_dataset, # "stack-exchange": stack_exchange_train_dataset, public_training_code=None, + public_training_data=None, ) labse = ModelMeta( @@ -233,6 +237,7 @@ primaryClass={cs.CL}, url={https://arxiv.org/abs/2007.01852}, }""", + public_training_data=None, ) multi_qa_MiniLM_L6_cos_v1 = ModelMeta( @@ -253,6 +258,7 @@ adapted_from="nreimers/MiniLM-L6-H384-uncased", training_datasets=sent_trf_training_dataset, # assumed public_training_code=None, + public_training_data=None, citation=SBERT_CITATION, ) @@ -272,39 +278,9 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=None, # does sentence transformer count? 
- training_datasets={ - # source: frontmatter in readme - # trained on stack exchange, unsure if sources match - "StackExchangeClusteringP2P": ["test"], - "StackExchangeClusteringP2P.v2": ["test"], - "StackExchangeClustering": ["test"], - "StackExchangeClustering.v2": ["test"], - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], - # Non MTEB sources - # "s2orc": ["train"], - # "flax-sentence-embeddings/stackexchange_xml": ["train"], - # "ms_marco": ["train"], - # "gooaq": ["train"], - # "yahoo_answers_topics": ["train"], - # "code_search_net": ["train"], - # "search_qa": ["train"], - # "eli5": ["train"], - # "snli": ["train"], - # "multi_nli": ["train"], - # "wikihow": ["train"], - # "trivia_qa": ["train"], - # "embedding-data/sentence-compression": ["train"], - # "embedding-data/flickr30k-captions": ["train"], - # "embedding-data/altlex": ["train"], - # "embedding-data/simple-wiki": ["train"], - # "embedding-data/QQP": ["train"], - # "embedding-data/SPECTER": ["train"], - # "embedding-data/PAQ_pairs": ["train"], - # "embedding-data/WikiAnswers": ["train"], - }, + training_datasets=sent_trf_training_dataset, + public_training_code=None, + public_training_data=None, citation=SBERT_CITATION, ) @@ -337,6 +313,7 @@ doi = {10.48550/ARXIV.2112.09118}, }""", public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -368,4 +345,5 @@ # "sentence-transformers/natural-questions": ["train"], }, public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index 44aa1f860..92d5db7c8 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -29,7 +29,9 @@ framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_400M_v5", training_datasets=None, + # will be at https://github.com/NLPJCL/RAG-Retrieval public_training_code=None, + public_training_data=None, ) stella_en_1_5b = ModelMeta( @@ -54,8 +56,10 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_1.5B_v5", + # will be at https://github.com/NLPJCL/RAG-Retrieval training_datasets=None, public_training_code=None, + public_training_data=None, ) stella_large_zh_v3_1792d = ModelMeta( @@ -75,6 +79,7 @@ superseded_by="dunzhang/stella-mrl-large-zh-v3.5-1792d", adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets={ # Not in MTEB: # - infgrad/dialogue_rewrite_llm @@ -99,6 +104,7 @@ superseded_by=None, adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets={ # Not in MTEB: # - infgrad/dialogue_rewrite_llm @@ -124,6 +130,7 @@ superseded_by=None, adapted_from="dunzhang/stella-large-zh-v3-1792d", public_training_code=None, + public_training_data=None, training_datasets=None, # Not specified ) @@ -144,6 +151,7 @@ superseded_by=None, adapted_from="dunzhang/stella-mrl-large-zh-v3.5-1792d", public_training_code=None, + public_training_data=None, training_datasets={ # It's a bit unclear what they have trained on to be honest, because they don't list all # And they also have some rather cryptic description of their training procedure, but at diff --git a/mteb/models/text2vec_models.py b/mteb/models/text2vec_models.py index 12322e69e..86a9bcca4 100644 --- a/mteb/models/text2vec_models.py +++ b/mteb/models/text2vec_models.py @@ -21,7 +21,8 @@ use_instructions=False, superseded_by=None, adapted_from=None, - 
public_training_code=None, # Couldn't find it + public_training_code=None, + public_training_data=None, # Couldn't find it training_datasets={ # source: https://huggingface.co/shibing624/text2vec-base-chinese # Not in MTEB @@ -46,7 +47,8 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=None, # Couldn't find it + public_training_code=None, + public_training_data=None, # Couldn't find it training_datasets={ # source: https://huggingface.co/shibing624/text2vec-base-chinese # Not in MTEB @@ -87,7 +89,8 @@ use_instructions=False, superseded_by=None, adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", - public_training_code=None, # Couldn't find it + public_training_code=None, + public_training_data=None, # Couldn't find it training_datasets={ # source: https://huggingface.co/shibing624/text2vec-base-chinese # Not in MTEB diff --git a/mteb/models/uae_models.py b/mteb/models/uae_models.py index e3cdaa843..a12a93632 100644 --- a/mteb/models/uae_models.py +++ b/mteb/models/uae_models.py @@ -91,4 +91,5 @@ def encode( "SNLI": [], }, public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index a98bc041b..a637dee36 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -12,6 +12,11 @@ from .wrapper import Wrapper +VOYAGE_TRAINING_DATA = { + # Self-reported (message from VoyageAI member) + # synthetic data +} + def token_limit(max_tpm: int, interval: int = 60): limit_interval_start_ts = time.time() @@ -156,8 +161,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_finance_2 = ModelMeta( @@ -179,8 +185,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_law_2 = ModelMeta( @@ -202,8 +209,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_code_2 = ModelMeta( @@ -225,8 +233,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_large_2 = ModelMeta( @@ -248,8 +257,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_2 = ModelMeta( @@ -271,8 +281,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_multilingual_2 = ModelMeta( name="voyageai/voyage-multilingual-2", @@ -293,8 +304,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_3 = ModelMeta( @@ -316,8 +328,9 @@ def _batched_encode( similarity_fn_name="cosine", 
framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_3_lite = ModelMeta( @@ -339,6 +352,79 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, + public_training_code=None, + public_training_data=None, +) + + +voyage_3_exp = ModelMeta( + name="voyageai/voyage-3-m-exp", + revision="1", + release_date=None, # not released + languages=None, # supported languages not specified + loader=partial( + VoyageWrapper, + model_name="voyage-3-m-exp", + model_prompts=model_prompts, + ), + max_tokens=32000, + embed_dim=512, + open_weights=False, + n_parameters=None, + license=None, + reference="https://huggingface.co/voyageai/voyage-3-m-exp", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=True, + training_datasets={ + # MTEB(eng, classic) training data: + "ArguAna": ["train"], + "ArguAna-PL": ["train"], + "NanoArguAnaRetrieval": ["train"], + "HotpotQA": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "NanoFEVERRetrieval": ["train"], + "FiQA2018": ["train"], + "FiQA2018-PL": ["train"], # translation not trained on + "STS12": ["train"], + "STS22": ["train"], + "AmazonReviewsClassification": ["train"], + "AmazonCounterfactualClassification": ["train"], + "Banking77Classification": ["train"], + "EmotionClassification": ["train"], + "ImdbClassification": ["train"], + "MTOPIntentClassification": ["train"], + "ToxicConversationsClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "ArxivClusteringP2P": ["train"], + "ArxivClusteringP2P.v2": ["train"], + "ArxivClusteringS2S": ["train"], + "ArxivClusteringS2S.v2": ["train"], + "BiorxivClusteringP2P": ["train"], + "BiorxivClusteringP2P.v2": ["train"], + "BiorxivClusteringS2S": ["train"], + "BiorxivClusteringS2S.v2": ["train"], + "MedrxivClusteringP2P": ["train"], + "MedrxivClusteringP2P.v2": ["train"], + "MedrxivClusteringS2S": ["train"], + "MedrxivClusteringS2S.v2": ["train"], + "TwentyNewsgroupsClustering": ["train"], + "TwentyNewsgroupsClustering.v2": ["train"], + "STSBenchmark": ["train"], + "STSBenchmarkMultilingualSTS": ["train"], # translated, not trained on + }, public_training_code=None, + public_training_data=None, ) diff --git a/pyproject.toml b/pyproject.toml index 58c94a197..f42014e3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.10" +version = "1.29.16" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ diff --git a/scripts/generate_metadata.py b/scripts/generate_metadata.py index a192fa134..4ae87fdbc 100644 --- a/scripts/generate_metadata.py +++ b/scripts/generate_metadata.py @@ -242,6 +242,7 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: license=None, open_weights=True, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, diff --git 
a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py index 4a535bebb..7705de4d3 100644 --- a/tests/test_tasks/test_mteb_rerank.py +++ b/tests/test_tasks/test_mteb_rerank.py @@ -374,6 +374,7 @@ def test_reranker_same_ndcg1(tmp_path: Path): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, reference=None, similarity_fn_name=None, use_instructions=None,