
Commit

Merge branch 'refs/heads/v2.0.0' into refactor_retrieval
# Conflicts:
#	mteb/models/salesforce_models.py
Samoed committed Jan 25, 2025
2 parents d2a1151 + c26adee commit e1f6fbf
Showing 77 changed files with 3,820 additions and 1,336 deletions.
4 changes: 2 additions & 2 deletions Makefile
@@ -5,7 +5,7 @@ install:
install-for-tests:
@echo "--- 🚀 Installing project dependencies for test ---"
@echo "This ensures that the project is not installed in editable mode"
pip install ".[dev,speedtask]"
pip install ".[dev,speedtask,bm25s,pylate]"

lint:
@echo "--- 🧹 Running linters ---"
@@ -45,5 +45,5 @@ serve-docs:
model-load-test:
@echo "--- 🚀 Running model load test ---"
pip install ".[dev, speedtask, pylate,gritlm,xformers,model2vec]"
- python scripts/extract_model_names.py $(BASE_BRANCH)
+ python scripts/extract_model_names.py $(BASE_BRANCH) --return_one_model_name_per_file
python tests/test_models/model_loading.py --model_name_file scripts/model_names.txt
200 changes: 100 additions & 100 deletions docs/mmteb/points_table.md

Large diffs are not rendered by default.

6 changes: 1 addition & 5 deletions mteb/abstasks/AbsTask.py
@@ -143,7 +143,7 @@ def evaluate(
hf_subsets = copy(self.hf_subsets)

if subsets_to_run is not None: # allow overwrites of pre-filtering
- hf_subsets = subsets_to_run
+ hf_subsets = [s for s in hf_subsets if s in subsets_to_run]

for hf_subset in hf_subsets:
logger.info(
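
The change in this hunk turns `subsets_to_run` from an overwrite into an intersection filter. A minimal sketch of the difference, using hypothetical subset names:

hf_subsets = ["en", "de", "fr"]  # subsets remaining after pre-filtering
subsets_to_run = ["de", "ru"]    # subsets requested by the caller

# Old behavior: the request replaced the pre-filtered list outright,
# so "ru" would be evaluated even though pre-filtering had excluded it.
old_result = subsets_to_run  # ["de", "ru"]

# New behavior: only subsets that survived pre-filtering are kept.
new_result = [s for s in hf_subsets if s in subsets_to_run]  # ["de"]
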
@@ -278,10 +278,6 @@ def _calculate_metrics_from_split(
) -> DescriptiveStatistics:
raise NotImplementedError

- @property
- def metadata_dict(self) -> dict[str, Any]:
-     return dict(self.metadata)

@property
def languages(self) -> list[str]:
"""Returns the languages of the task"""
133 changes: 109 additions & 24 deletions mteb/benchmarks/benchmarks.py
@@ -71,7 +71,7 @@ def load_results(


MTEB_EN = Benchmark(
name="MTEB(eng, beta)",
name="MTEB(eng)",
tasks=MTEBTasks(
get_tasks(
tasks=[
@@ -128,7 +128,13 @@ def load_results(
get_task("STS22.v2", eval_splits=["test"], hf_subsets=["en"]),
),
),
description="English benchmarks from MTEB",
description="""The new English Massive Text Embedding Benchmark.
This benchmark was created to account for the fact that many models have now been finetuned
to tasks in the original MTEB, and contains tasks that are not as frequently used for model training.
This way the new benchmark and leaderboard can give our users a more realistic expectation of models' generalization performance.
The original MTEB leaderboard is available under the [MTEB(eng, classic)](http://mteb-leaderboard-2-demo.hf.space/?benchmark_name=MTEB%28eng%2C+classic%29) tab.
""",
citation="",
contacts=["KennethEnevoldsen", "Muennighoff"],
)
@@ -216,7 +222,12 @@ def load_results(
get_task("STS22", eval_splits=["test"], hf_subsets=["en"]),
)
),
description="The original English benchmark by Muennighoff et al., (2023).",
description="""The original English benchmark by Muennighoff et al., (2023).
This page is an adaptation of the [old MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
> We recommend that you use [MTEB(eng)](http://mteb-leaderboard-2-demo.hf.space/?benchmark_name=MTEB%28eng%29) instead,
as many models have been tuned on MTEB(eng, classic) datasets, and MTEB(eng) might give a more accurate representation of models' generalization performance.
""",
citation="""@inproceedings{muennighoff-etal-2023-mteb,
title = "{MTEB}: Massive Text Embedding Benchmark",
author = "Muennighoff, Niklas and
@@ -275,7 +286,7 @@ def load_results(
"STS22",
],
),
description="Main Russian benchmarks from MTEB",
description="A Russian version of the Massive Text Embedding Benchmark with a number of novel Russian tasks in all task categories of the original MTEB.",
reference="https://aclanthology.org/2023.eacl-main.148/",
citation="""@misc{snegirev2024russianfocusedembeddersexplorationrumteb,
title={The Russian-focused embedders' exploration: ruMTEB benchmark and Russian embedding model design},
@@ -324,8 +335,8 @@ def load_results(
"LegalQuAD",
]
),
description="Legal benchmarks from MTEB.",
reference="https://aclanthology.org/2023.eacl-main.148/",
description="A benchmark of retrieval tasks in the legal domain.",
reference=None,
citation=None,
)

@@ -365,7 +376,10 @@ def load_results(
"Tatoeba",
]
),
description="BitextMining benchmark from MINERS",
description="""Bitext Mining texts from the MINERS benchmark, a benchmark designed to evaluate the
ability of multilingual LMs in semantic retrieval tasks,
including bitext mining and classification via retrieval-augmented contexts.
""",
reference="https://arxiv.org/pdf/2406.07424",
citation="""
@article{winata2024miners,
@@ -533,7 +547,7 @@ def load_results(
)
+ (get_task("STS22", eval_splits=["test"], hf_subsets=["fr"]),)
),
description="Main French benchmarks from MTEB",
description="MTEB-French, a French expansion of the original benchmark with high-quality native French datasets.",
reference="https://arxiv.org/abs/2405.20468",
citation="""@misc{ciancone2024mtebfrenchresourcesfrenchsentence,
title={MTEB-French: Resources for French Sentence Embedding Evaluation and Analysis},
@@ -581,7 +595,7 @@ def load_results(
"STS22",
],
),
description="Main German benchmarks from MTEB",
description="A benchmark for text-embedding performance in German.",
reference="https://arxiv.org/html/2401.02709v1",
citation="""@misc{wehrli2024germantextembeddingclustering,
title={German Text Embedding Clustering Benchmark},
@@ -592,6 +606,7 @@ def load_results(
primaryClass={cs.CL},
url={https://arxiv.org/abs/2401.02709},
}""",
contacts=["slvnwhrl"],
)


@@ -612,7 +627,7 @@ def load_results(
"KorSTS",
],
),
description="Main Korean benchmarks from MTEB",
description="A benchmark and leaderboard for evaluation of text embedding in Korean.",
reference=None,
citation=None,
)
@@ -649,14 +664,19 @@ def load_results(
)
+ (get_task("STS22", eval_splits=["test"], hf_subsets=["pl"]),),
),
description="Main Polish benchmarks from MTEB",
description="""Polish Massive Text Embedding Benchmark (PL-MTEB), a comprehensive benchmark for text embeddings in Polish. The PL-MTEB consists of 28 diverse NLP
tasks from 5 task types. With tasks adapted based on previously used datasets by the Polish
NLP community. In addition, a new PLSC (Polish Library of Science Corpus) dataset was created
consisting of titles and abstracts of scientific publications in Polish, which was used as the basis for
two novel clustering tasks.""", # Rephrased from the abstract
reference="https://arxiv.org/abs/2405.10138",
citation="""@article{poswiata2024plmteb,
title={PL-MTEB: Polish Massive Text Embedding Benchmark},
author={Rafał Poświata and Sławomir Dadas and Michał Perełkiewicz},
journal={arXiv preprint arXiv:2405.10138},
year={2024}
}""",
contacts=["rafalposwiata"],
)

MTEB_code = Benchmark(
@@ -693,14 +713,14 @@ def load_results(
"typescript",
],
),
description="Main code benchmarks from MTEB",
description="A massive code embedding benchmark covering retrieval tasks in a miriad of popular programming languages.",
reference=None,
citation=None,
)


MTEB_multilingual = Benchmark(
name="MTEB(Multilingual, beta)",
name="MTEB(Multilingual)",
tasks=get_tasks(
tasks=[
"BornholmBitextMining",
@@ -838,10 +858,10 @@ def load_results(
"MIRACLRetrievalHardNegatives",
],
),
description="The Multilingual benchmarks from MMTEB. Currently under development.",
description="A large-scale multilingual expansion of MTEB, driven mainly by highly-curated community contributions covering 250+ languages.",
reference=None,
citation=None,
contacts=["KennethEnevoldsen"],
contacts=["KennethEnevoldsen", "isaac-chung"],
)

MTEB_JPN = Benchmark(
@@ -873,7 +893,7 @@ def load_results(
"ESCIReranking",
],
),
description="Main Japanese benchmarks from MTEB",
description="JMTEB is a benchmark for evaluating Japanese text embedding models.",
reference="https://github.com/sbintuitions/JMTEB",
citation=None,
)
@@ -913,7 +933,7 @@ def load_results(
]

MTEB_INDIC = Benchmark(
name="MTEB(Indic, beta)",
name="MTEB(Indic)",
tasks=get_tasks(
tasks=[
# Bitext
@@ -950,10 +970,10 @@ def load_results(
languages=indic_languages,
exclusive_language_filter=True,
),
description="Main Indic benchmark from MMTEB",
description="A regional geopolitical text embedding benchmark targetting embedding performance on Indic languages.",
reference=None,
citation=None,
contacts=["KennethEnevoldsen"],
contacts=["KennethEnevoldsen", "isaac-chung"],
)


@@ -1001,7 +1021,7 @@ def load_results(
]

MTEB_EU = Benchmark(
name="MTEB(Europe, beta)",
name="MTEB(Europe)",
tasks=get_tasks(
tasks=[
"BornholmBitextMining",
@@ -1082,10 +1102,10 @@ def load_results(
languages=eu_languages,
exclusive_language_filter=True,
),
description="Main European benchmark from MMTEB",
description="A regional geopolitical text embedding benchmark targetting embedding performance on European languages.",
reference=None,
citation=None,
contacts=["KennethEnevoldsen"],
contacts=["KennethEnevoldsen", "isaac-chung"],
)

LONG_EMBED = Benchmark(
@@ -1100,7 +1120,10 @@ def load_results(
"LEMBWikimQARetrieval",
],
),
description="The main benchmark for evaluating long document retrieval.",
description="""LongEmbed is a benchmark oriented at exploring models' performance on long-context retrieval.
The benchmark comprises two synthetic tasks and four carefully chosen real-world tasks,
featuring documents of varying length and dispersed target information.
""", # Pieced together from paper abstract.
reference="https://arxiv.org/abs/2404.12096v2",
citation="""@article{zhu2024longembed,
title={LongEmbed: Extending Embedding Models for Long Context Retrieval},
@@ -1115,7 +1138,13 @@ def load_results(
tasks=get_tasks(
tasks=["BrightRetrieval"],
),
description="A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.",
description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.
BRIGHT is the first text retrieval
benchmark that requires intensive reasoning to retrieve relevant documents with
a dataset consisting of 1,384 real-world queries spanning diverse domains, such as
economics, psychology, mathematics, and coding. These queries are drawn from
naturally occurring and carefully curated human data.
""",
reference="https://brightbenchmark.github.io/",
citation="""@article{su2024bright,
title={Bright: A realistic and challenging benchmark for reasoning-intensive retrieval},
@@ -1148,3 +1177,59 @@ def load_results(
reference="https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6",
citation=None,
)

+ C_MTEB = Benchmark(
+     name="MTEB(Chinese)",
+     tasks=MTEBTasks(
+         get_tasks(
+             tasks=[
+                 "T2Retrieval",
+                 "MMarcoRetrieval",
+                 "DuRetrieval",
+                 "CovidRetrieval",
+                 "CmedqaRetrieval",
+                 "EcomRetrieval",
+                 "MedicalRetrieval",
+                 "VideoRetrieval",
+                 "T2Reranking",
+                 "MMarcoReranking",
+                 "CMedQAv1-reranking",
+                 "CMedQAv2-reranking",
+                 "Ocnli",
+                 "Cmnli",
+                 "CLSClusteringS2S",
+                 "CLSClusteringP2P",
+                 "ThuNewsClusteringS2S",
+                 "ThuNewsClusteringP2P",
+                 "LCQMC",
+                 "PAWSX",
+                 "AFQMC",
+                 "QBQTC",
+                 "TNews",
+                 "IFlyTek",
+                 "Waimai",
+                 "OnlineShopping",
+                 "JDReview",
+             ],
+         )
+         + get_tasks(tasks=["MultilingualSentiment"], eval_splits=["test"])
+         + get_tasks(
+             tasks=[
+                 "ATEC",
+                 "BQ",
+                 "STSB",
+             ],
+             eval_splits=["validation"],
+         )
+     ),
+     description="The Chinese Massive Text Embedding Benchmark (C-MTEB) is a comprehensive benchmark for Chinese text embeddings covering 6 tasks and 35 datasets.",
+     reference="https://github.com/FlagOpen/FlagEmbedding/tree/master/research/C_MTEB",
+     citation="""@misc{c-pack,
+ title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+ author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
+ year={2023},
+ eprint={2309.07597},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }""",
+ )
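
For orientation, benchmarks defined in this file are looked up by their `name` at runtime. A minimal usage sketch, assuming the `get_benchmark`, `get_model`, and `MTEB` entry points exposed by the mteb package (the model name is an arbitrary example):

import mteb

# Fetch a registered benchmark by the name it is defined with above.
benchmark = mteb.get_benchmark("MTEB(eng)")

# A Benchmark behaves as a task collection, so it can be handed to the evaluator.
evaluation = mteb.MTEB(tasks=benchmark)
results = evaluation.run(mteb.get_model("sentence-transformers/all-MiniLM-L6-v2"))
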
21 changes: 1 addition & 20 deletions mteb/cli.py
@@ -374,26 +374,7 @@ def main():
add_create_meta_parser(subparsers)

args = parser.parse_args()

- # If no subcommand is provided, default to run with a deprecation warning
- if not hasattr(args, "func"):
-     logger.warning(
-         "Using `mteb` without a subcommand is deprecated. Use `mteb run` instead.",
-         DeprecationWarning,
-     )
-     # Set default arguments for 'run' if no subcommand is provided
-     default_args = parser.parse_args(
-         ["run"]
-         + list(map(str, args._get_args()))
-         + [
-             f"--{k}" if v is None else f"--{k}={v}"
-             for k, v in vars(args).items()
-             if k != "func"
-         ]
-     )
-     default_args.func(default_args)
- else:
-     args.func(args)
+ args.func(args)


if __name__ == "__main__":
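
Note: with the fallback removed, running `mteb` without a subcommand now exits with the parser's usage message instead of being silently redirected to `run`; the subcommand must be invoked explicitly, e.g. `mteb run -m <model_name> -t <task_name>` (placeholders illustrative).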
