
Commit

Merge branch 'refs/heads/v2.0.0' into refactor_retrieval
# Conflicts:
#	mteb/models/salesforce_models.py
Samoed committed Jan 25, 2025
2 parents d2a1151 + c26adee commit e1f6fbf
Showing 77 changed files with 3,820 additions and 1,336 deletions.
4 changes: 2 additions & 2 deletions Makefile
@@ -5,7 +5,7 @@ install:
install-for-tests:
@echo "--- 🚀 Installing project dependencies for test ---"
@echo "This ensures that the project is not installed in editable mode"
pip install ".[dev,speedtask]"
pip install ".[dev,speedtask,bm25s,pylate]"

lint:
@echo "--- 🧹 Running linters ---"
@@ -45,5 +45,5 @@ serve-docs:
model-load-test:
@echo "--- 🚀 Running model load test ---"
pip install ".[dev, speedtask, pylate,gritlm,xformers,model2vec]"
- python scripts/extract_model_names.py $(BASE_BRANCH)
+ python scripts/extract_model_names.py $(BASE_BRANCH) --return_one_model_name_per_file
python tests/test_models/model_loading.py --model_name_file scripts/model_names.txt
200 changes: 100 additions & 100 deletions docs/mmteb/points_table.md

Large diffs are not rendered by default.

6 changes: 1 addition & 5 deletions mteb/abstasks/AbsTask.py
@@ -143,7 +143,7 @@ def evaluate(
hf_subsets = copy(self.hf_subsets)

if subsets_to_run is not None: # allow overwrites of pre-filtering
- hf_subsets = subsets_to_run
+ hf_subsets = [s for s in hf_subsets if s in subsets_to_run]

for hf_subset in hf_subsets:
logger.info(
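
The change in this hunk turns `subsets_to_run` from an overwrite into an intersection filter. A minimal sketch of the difference, using hypothetical subset names:

hf_subsets = ["en", "de", "fr"]  # subsets remaining after pre-filtering
subsets_to_run = ["de", "ru"]    # subsets requested by the caller

# Old behavior: the request replaced the pre-filtered list outright,
# so "ru" would be evaluated even though pre-filtering had excluded it.
old_result = subsets_to_run  # ["de", "ru"]

# New behavior: only subsets that survived pre-filtering are kept.
new_result = [s for s in hf_subsets if s in subsets_to_run]  # ["de"]
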
@@ -278,10 +278,6 @@ def _calculate_metrics_from_split(
) -> DescriptiveStatistics:
raise NotImplementedError

- @property
- def metadata_dict(self) -> dict[str, Any]:
-     return dict(self.metadata)

@property
def languages(self) -> list[str]:
"""Returns the languages of the task"""
133 changes: 109 additions & 24 deletions mteb/benchmarks/benchmarks.py
@@ -71,7 +71,7 @@ def load_results(


MTEB_EN = Benchmark(
name="MTEB(eng, beta)",
name="MTEB(eng)",
tasks=MTEBTasks(
get_tasks(
tasks=[
@@ -128,7 +128,13 @@ def load_results(
get_task("STS22.v2", eval_splits=["test"], hf_subsets=["en"]),
),
),
description="English benchmarks from MTEB",
description="""The new English Massive Text Embedding Benchmark.
This benchmark was created to account for the fact that many models have now been finetuned
to tasks in the original MTEB, and contains tasks that are not as frequently used for model training.
This way the new benchmark and leaderboard can give our users a more realistic expectation of models' generalization performance.
The original MTEB leaderboard is available under the [MTEB(eng, classic)](http://mteb-leaderboard-2-demo.hf.space/?benchmark_name=MTEB%28eng%2C+classic%29) tab.
""",
citation="",
contacts=["KennethEnevoldsen", "Muennighoff"],
)
@@ -216,7 +222,12 @@ def load_results(
get_task("STS22", eval_splits=["test"], hf_subsets=["en"]),
)
),
description="The original English benchmark by Muennighoff et al., (2023).",
description="""The original English benchmark by Muennighoff et al., (2023).
This page is an adaptation of the [old MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
> We recommend that you use [MTEB(eng)](http://mteb-leaderboard-2-demo.hf.space/?benchmark_name=MTEB%28eng%29) instead,
as many models have been tuned on MTEB(eng, classic) datasets, and MTEB(eng) might give a more accurate representation of models' generalization performance.
""",
citation="""@inproceedings{muennighoff-etal-2023-mteb,
title = "{MTEB}: Massive Text Embedding Benchmark",
author = "Muennighoff, Niklas and
@@ -275,7 +286,7 @@ def load_results(
"STS22",
],
),
description="Main Russian benchmarks from MTEB",
description="A Russian version of the Massive Text Embedding Benchmark with a number of novel Russian tasks in all task categories of the original MTEB.",
reference="https://aclanthology.org/2023.eacl-main.148/",
citation="""@misc{snegirev2024russianfocusedembeddersexplorationrumteb,
title={The Russian-focused embedders' exploration: ruMTEB benchmark and Russian embedding model design},
@@ -324,8 +335,8 @@ def load_results(
"LegalQuAD",
]
),
description="Legal benchmarks from MTEB.",
reference="https://aclanthology.org/2023.eacl-main.148/",
description="A benchmark of retrieval tasks in the legal domain.",
reference=None,
citation=None,
)

@@ -365,7 +376,10 @@ def load_results(
"Tatoeba",
]
),
description="BitextMining benchmark from MINERS",
description="""Bitext Mining texts from the MINERS benchmark, a benchmark designed to evaluate the
ability of multilingual LMs in semantic retrieval tasks,
including bitext mining and classification via retrieval-augmented contexts.
""",
reference="https://arxiv.org/pdf/2406.07424",
citation="""
@article{winata2024miners,
@@ -533,7 +547,7 @@ def load_results(
)
+ (get_task("STS22", eval_splits=["test"], hf_subsets=["fr"]),)
),
description="Main French benchmarks from MTEB",
description="MTEB-French, a French expansion of the original benchmark with high-quality native French datasets.",
reference="https://arxiv.org/abs/2405.20468",
citation="""@misc{ciancone2024mtebfrenchresourcesfrenchsentence,
title={MTEB-French: Resources for French Sentence Embedding Evaluation and Analysis},
@@ -581,7 +595,7 @@ def load_results(
"STS22",
],
),
description="Main German benchmarks from MTEB",
description="A benchmark for text-embedding performance in German.",
reference="https://arxiv.org/html/2401.02709v1",
citation="""@misc{wehrli2024germantextembeddingclustering,
title={German Text Embedding Clustering Benchmark},
@@ -592,6 +606,7 @@ def load_results(
primaryClass={cs.CL},
url={https://arxiv.org/abs/2401.02709},
}""",
contacts=["slvnwhrl"],
)


@@ -612,7 +627,7 @@ def load_results(
"KorSTS",
],
),
description="Main Korean benchmarks from MTEB",
description="A benchmark and leaderboard for evaluation of text embedding in Korean.",
reference=None,
citation=None,
)
@@ -649,14 +664,19 @@ def load_results(
)
+ (get_task("STS22", eval_splits=["test"], hf_subsets=["pl"]),),
),
description="Main Polish benchmarks from MTEB",
description="""Polish Massive Text Embedding Benchmark (PL-MTEB), a comprehensive benchmark for text embeddings in Polish. The PL-MTEB consists of 28 diverse NLP
tasks from 5 task types. With tasks adapted based on previously used datasets by the Polish
NLP community. In addition, a new PLSC (Polish Library of Science Corpus) dataset was created
consisting of titles and abstracts of scientific publications in Polish, which was used as the basis for
two novel clustering tasks.""", # Rephrased from the abstract
reference="https://arxiv.org/abs/2405.10138",
citation="""@article{poswiata2024plmteb,
title={PL-MTEB: Polish Massive Text Embedding Benchmark},
author={Rafał Poświata and Sławomir Dadas and Michał Perełkiewicz},
journal={arXiv preprint arXiv:2405.10138},
year={2024}
}""",
contacts=["rafalposwiata"],
)

MTEB_code = Benchmark(
@@ -693,14 +713,14 @@ def load_results(
"typescript",
],
),
description="Main code benchmarks from MTEB",
description="A massive code embedding benchmark covering retrieval tasks in a miriad of popular programming languages.",
reference=None,
citation=None,
)


MTEB_multilingual = Benchmark(
name="MTEB(Multilingual, beta)",
name="MTEB(Multilingual)",
tasks=get_tasks(
tasks=[
"BornholmBitextMining",
@@ -838,10 +858,10 @@ def load_results(
"MIRACLRetrievalHardNegatives",
],
),
description="The Multilingual benchmarks from MMTEB. Currently under development.",
description="A large-scale multilingual expansion of MTEB, driven mainly by highly-curated community contributions covering 250+ languages.",
reference=None,
citation=None,
contacts=["KennethEnevoldsen"],
contacts=["KennethEnevoldsen", "isaac-chung"],
)

MTEB_JPN = Benchmark(
@@ -873,7 +893,7 @@ def load_results(
"ESCIReranking",
],
),
description="Main Japanese benchmarks from MTEB",
description="JMTEB is a benchmark for evaluating Japanese text embedding models.",
reference="https://github.com/sbintuitions/JMTEB",
citation=None,
)
@@ -913,7 +933,7 @@ def load_results(
]

MTEB_INDIC = Benchmark(
name="MTEB(Indic, beta)",
name="MTEB(Indic)",
tasks=get_tasks(
tasks=[
# Bitext
@@ -950,10 +970,10 @@ def load_results(
languages=indic_languages,
exclusive_language_filter=True,
),
description="Main Indic benchmark from MMTEB",
description="A regional geopolitical text embedding benchmark targetting embedding performance on Indic languages.",
reference=None,
citation=None,
contacts=["KennethEnevoldsen"],
contacts=["KennethEnevoldsen", "isaac-chung"],
)


@@ -1001,7 +1021,7 @@ def load_results(
]

MTEB_EU = Benchmark(
name="MTEB(Europe, beta)",
name="MTEB(Europe)",
tasks=get_tasks(
tasks=[
"BornholmBitextMining",
@@ -1082,10 +1102,10 @@ def load_results(
languages=eu_languages,
exclusive_language_filter=True,
),
description="Main European benchmark from MMTEB",
description="A regional geopolitical text embedding benchmark targetting embedding performance on European languages.",
reference=None,
citation=None,
contacts=["KennethEnevoldsen"],
contacts=["KennethEnevoldsen", "isaac-chung"],
)

LONG_EMBED = Benchmark(
@@ -1100,7 +1120,10 @@ def load_results(
"LEMBWikimQARetrieval",
],
),
description="The main benchmark for evaluating long document retrieval.",
description="""LongEmbed is a benchmark oriented at exploring models' performance on long-context retrieval.
The benchmark comprises two synthetic tasks and four carefully chosen real-world tasks,
featuring documents of varying length and dispersed target information.
""", # Pieced together from paper abstract.
reference="https://arxiv.org/abs/2404.12096v2",
citation="""@article{zhu2024longembed,
title={LongEmbed: Extending Embedding Models for Long Context Retrieval},
@@ -1115,7 +1138,13 @@ def load_results(
tasks=get_tasks(
tasks=["BrightRetrieval"],
),
description="A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.",
description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.
BRIGHT is the first text retrieval
benchmark that requires intensive reasoning to retrieve relevant documents with
a dataset consisting of 1,384 real-world queries spanning diverse domains, such as
economics, psychology, mathematics, and coding. These queries are drawn from
naturally occurring and carefully curated human data.
""",
reference="https://brightbenchmark.github.io/",
citation="""@article{su2024bright,
title={Bright: A realistic and challenging benchmark for reasoning-intensive retrieval},
@@ -1148,3 +1177,59 @@ def load_results(
reference="https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6",
citation=None,
)

+ C_MTEB = Benchmark(
+     name="MTEB(Chinese)",
+     tasks=MTEBTasks(
+         get_tasks(
+             tasks=[
+                 "T2Retrieval",
+                 "MMarcoRetrieval",
+                 "DuRetrieval",
+                 "CovidRetrieval",
+                 "CmedqaRetrieval",
+                 "EcomRetrieval",
+                 "MedicalRetrieval",
+                 "VideoRetrieval",
+                 "T2Reranking",
+                 "MMarcoReranking",
+                 "CMedQAv1-reranking",
+                 "CMedQAv2-reranking",
+                 "Ocnli",
+                 "Cmnli",
+                 "CLSClusteringS2S",
+                 "CLSClusteringP2P",
+                 "ThuNewsClusteringS2S",
+                 "ThuNewsClusteringP2P",
+                 "LCQMC",
+                 "PAWSX",
+                 "AFQMC",
+                 "QBQTC",
+                 "TNews",
+                 "IFlyTek",
+                 "Waimai",
+                 "OnlineShopping",
+                 "JDReview",
+             ],
+         )
+         + get_tasks(tasks=["MultilingualSentiment"], eval_splits=["test"])
+         + get_tasks(
+             tasks=[
+                 "ATEC",
+                 "BQ",
+                 "STSB",
+             ],
+             eval_splits=["validation"],
+         )
+     ),
+     description="The Chinese Massive Text Embedding Benchmark (C-MTEB) is a comprehensive benchmark for Chinese text embeddings covering 6 tasks and 35 datasets.",
+     reference="https://github.com/FlagOpen/FlagEmbedding/tree/master/research/C_MTEB",
+     citation="""@misc{c-pack,
+ title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+ author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
+ year={2023},
+ eprint={2309.07597},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }""",
+ )
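
For orientation, benchmarks defined in this file are looked up by their `name` at runtime. A minimal usage sketch, assuming the `get_benchmark`, `get_model`, and `MTEB` entry points exposed by the mteb package (the model name is an arbitrary example):

import mteb

# Fetch a registered benchmark by the name it is defined with above.
benchmark = mteb.get_benchmark("MTEB(eng)")

# A Benchmark behaves as a task collection, so it can be handed to the evaluator.
evaluation = mteb.MTEB(tasks=benchmark)
results = evaluation.run(mteb.get_model("sentence-transformers/all-MiniLM-L6-v2"))
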
21 changes: 1 addition & 20 deletions mteb/cli.py
@@ -374,26 +374,7 @@ def main():
add_create_meta_parser(subparsers)

args = parser.parse_args()

- # If no subcommand is provided, default to run with a deprecation warning
- if not hasattr(args, "func"):
-     logger.warning(
-         "Using `mteb` without a subcommand is deprecated. Use `mteb run` instead.",
-         DeprecationWarning,
-     )
-     # Set default arguments for 'run' if no subcommand is provided
-     default_args = parser.parse_args(
-         ["run"]
-         + list(map(str, args._get_args()))
-         + [
-             f"--{k}" if v is None else f"--{k}={v}"
-             for k, v in vars(args).items()
-             if k != "func"
-         ]
-     )
-     default_args.func(default_args)
- else:
-     args.func(args)
+ args.func(args)


if __name__ == "__main__":
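
Note: with the fallback removed, running `mteb` without a subcommand now exits with the parser's usage message instead of being silently redirected to `run`; the subcommand must be invoked explicitly, e.g. `mteb run -m <model_name> -t <task_name>` (placeholders illustrative).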
