From 3f093c86a5e4bccd31e8a9ed860d1a33bd64b391 Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen
Date: Sat, 11 Jan 2025 17:41:12 +0100
Subject: [PATCH] fix: added annotations for training data (#1742)

* fix: Added annotations for arctic embed models

* added google and bge

* added cohere

* Added e5

* added bge based model2vec

* annotated OpenAI

* format and update annotations

---
 mteb/models/arctic_models.py                | 183 ++++++++++++++++++
 mteb/models/bge_models.py                   |  93 +++++++++
 mteb/models/cohere_models.py                |  12 ++
 mteb/models/e5_models.py                    | 108 +++++++++++
 mteb/models/google_models.py                |   9 +
 mteb/models/model2vec_models.py             |  96 ++++++++-
 mteb/models/openai_models.py                |   9 +
 mteb/models/sentence_transformers_models.py | 172 +++++++++-------
 .../eng/StackExchangeClusteringP2P.py       |   1 -
 9 files changed, 609 insertions(+), 74 deletions(-)

diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py
index e16de37f74..b4c2b97ac6 100644
--- a/mteb/models/arctic_models.py
+++ b/mteb/models/arctic_models.py
@@ -103,6 +103,32 @@
     use_instructions=True,
     adapted_from="sentence-transformers/all-MiniLM-L6-v2",
     superseded_by=None,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2405.05374
+        # splits not specified, so assuming everything
+        # in MTEB
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "HotPotQA": ["test"],
+        "HotPotQAHardNegatives": ["test"],
+        "HotPotQA-PL": ["test"],  # translated from HotPotQA (not trained on)
+        "FEVER": ["test"],
+        "FEVERHardNegatives": ["test"],
+        # not in MTEB
+        # trained on stack exchange (title-body)
+        # "stackexchange": [],
+        # potentially means that:
+        # "StackExchangeClusteringP2P": ["test"],
+        # "StackExchangeClusteringP2P.v2": ["test"],
+        # "StackExchangeClustering": ["test"],
+        # "StackExchangeClustering.v2": ["test"],
+        # not in MTEB
+        # "paq": [],
+        # "s2orc": [],
+        # "other": [],  # undisclosed, including webdata
+    },  # also uses synthetic data
 )
 
 
@@ -128,6 +154,32 @@
     use_instructions=True,
     adapted_from="intfloat/e5-small-unsupervised",
     superseded_by=None,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2405.05374
+        # splits not specified, so assuming everything
+        # in MTEB
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "HotPotQA": ["test"],
+        "HotPotQAHardNegatives": ["test"],
+        "HotPotQA-PL": ["test"],  # translated from HotPotQA (not trained on)
+        "FEVER": ["test"],
+        "FEVERHardNegatives": ["test"],
+        # not in MTEB
+        # trained on stack exchange (title-body)
+        # "stackexchange": [],
+        # potentially means that:
+        # "StackExchangeClusteringP2P": ["test"],
+        # "StackExchangeClusteringP2P.v2": ["test"],
+        # "StackExchangeClustering": ["test"],
+        # "StackExchangeClustering.v2": ["test"],
+        # not in MTEB
+        # "paq": [],
+        # "s2orc": [],
+        # "other": [],  # undisclosed, including webdata
+    },  # also uses synthetic data
 )
 
 
@@ -153,6 +205,32 @@
     use_instructions=True,
     adapted_from="intfloat/e5-base-unsupervised",
     superseded_by="Snowflake/snowflake-arctic-embed-m-v1.5",
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2405.05374
+        # splits not specified, so assuming everything
+        # in MTEB
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "HotPotQA": ["test"],
+        "HotPotQAHardNegatives": ["test"],
+        "HotPotQA-PL": ["test"],  # translated from HotPotQA (not trained on)
+        "FEVER": ["test"],
"FEVERHardNegatives": ["test"], + # not in MTEB + # trained on stack exchange (title-body) + # "stackexchange": [], + # potentially means that: + # "StackExchangeClusteringP2P": ["test"], + # "StackExchangeClusteringP2P.v2": ["test"], + # "StackExchangeClustering": ["test"], + # "StackExchangeClustering.v2": ["test"], + # not in MTEB + # "paq": [], + # "s2orc": [], + # "other": [], # undisclosed including webdata + }, # also use synthetic ) arctic_embed_m_long = ModelMeta( @@ -178,6 +256,33 @@ use_instructions=True, adapted_from="nomic-ai/nomic-embed-text-v1-unsupervised", superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0", + public_training_data=False, # couldn't find + public_training_code=False, # couldn't find + training_datasets={ + # source: https://arxiv.org/pdf/2405.05374 + # splits not specified to assuming everything + # in MTEB + "NQ": ["test"], + "NQHardNegatives": ["test"], + "HotPotQA": ["test"], + "HotPotQAHardNegatives": ["test"], + "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) + "FEVER": ["test"], + "FEVERHardNegatives": ["test"], + # trained on stack exchange, unsure if sources match + # not in MTEB + # trained on stack exchange (title-body) + # "stackexchange": [], + # potentially means that: + # "StackExchangeClusteringP2P": ["test"], + # "StackExchangeClusteringP2P.v2": ["test"], + # "StackExchangeClustering": ["test"], + # "StackExchangeClustering.v2": ["test"], + # not in MTEB + # "paq": [], + # "s2orc": [], + # "other": [], # undisclosed including webdata + }, # also use synthetic ) arctic_embed_l = ModelMeta( @@ -202,6 +307,32 @@ use_instructions=True, adapted_from="intfloat/e5-base-unsupervised", superseded_by="Snowflake/snowflake-arctic-embed-l-v2.0", + public_training_data=False, # couldn't find + public_training_code=False, # couldn't find + training_datasets={ + # source: https://arxiv.org/pdf/2405.05374 + # splits not specified to assuming everything + # in MTEB + "NQ": ["test"], + "NQHardNegatives": ["test"], + "HotPotQA": ["test"], + "HotPotQAHardNegatives": ["test"], + "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) + "FEVER": ["test"], + "FEVERHardNegatives": ["test"], + # not in MTEB + # trained on stack exchange (title-body) + # "stackexchange": [], + # potentially means that: + # "StackExchangeClusteringP2P": ["test"], + # "StackExchangeClusteringP2P.v2": ["test"], + # "StackExchangeClustering": ["test"], + # "StackExchangeClustering.v2": ["test"], + # not in MTEB + # "paq": [], + # "s2orc": [], + # "other": [], # undisclosed including webdata + }, # also use synthetic ) arctic_embed_m_v1_5 = ModelMeta( @@ -254,6 +385,32 @@ use_instructions=True, adapted_from="Alibaba-NLP/gte-multilingual-base", superseded_by=None, + public_training_data=False, # couldn't find + public_training_code=False, # couldn't find + training_datasets={ + # source: https://arxiv.org/pdf/2405.05374 + # splits not specified to assuming everything + # in MTEB + "NQ": ["test"], + "NQHardNegatives": ["test"], + "HotPotQA": ["test"], + "HotPotQAHardNegatives": ["test"], + "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) + "FEVER": ["test"], + "FEVERHardNegatives": ["test"], + # not in MTEB + # trained on stack exchange (title-body) + # "stackexchange": [], + # potentially means that: + # "StackExchangeClusteringP2P": ["test"], + # "StackExchangeClusteringP2P.v2": ["test"], + # "StackExchangeClustering": ["test"], + # "StackExchangeClustering.v2": ["test"], + # not in MTEB + # "paq": [], + # "s2orc": [], + # "other": 
+    },  # also uses synthetic data
 )
 
 arctic_embed_l_v2_0 = ModelMeta(
@@ -278,4 +435,30 @@
     use_instructions=True,
     adapted_from="BAAI/bge-m3-retromae",
     superseded_by=None,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2405.05374
+        # splits not specified, so assuming everything
+        # in MTEB
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "HotPotQA": ["test"],
+        "HotPotQAHardNegatives": ["test"],
+        "HotPotQA-PL": ["test"],  # translated from HotPotQA (not trained on)
+        "FEVER": ["test"],
+        "FEVERHardNegatives": ["test"],
+        # not in MTEB
+        # trained on stack exchange (title-body)
+        # "stackexchange": [],
+        # potentially means that:
+        # "StackExchangeClusteringP2P": ["test"],
+        # "StackExchangeClusteringP2P.v2": ["test"],
+        # "StackExchangeClustering": ["test"],
+        # "StackExchangeClustering.v2": ["test"],
+        # not in MTEB
+        # "paq": [],
+        # "s2orc": [],
+        # "other": [],  # undisclosed, including webdata
+    },  # also uses synthetic data
 )
diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py
index cc183374c6..276d28526f 100644
--- a/mteb/models/bge_models.py
+++ b/mteb/models/bge_models.py
@@ -27,6 +27,37 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
+    public_training_data=True,  # https://data.baai.ac.cn/details/BAAI-MTP
+    public_training_code=None,  # seemingly released (at least for some models), but the link is broken
+    training_datasets={
+        # source: https://data.baai.ac.cn/details/BAAI-MTP
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "AmazonReviewsClassification": [
+            "validation",
+            "test",
+        ],  # assumed from: amazon_reviews_multi
+        "MLQARetrieval": [
+            "validation",
+            "test",
+        ],  # assumed from mlqa (question, context)
+        # not in mteb
+        # Dataset Pairs
+        # wudao (title, passage)
+        # cmrc2018 (query, context)
+        # dureader (query, context)
+        # simclue (sentence_a, sentence_b)
+        # csl (title, abstract)
+        # amazon_reviews_multi (title, body)
+        # wiki_atomic_edits (base_sentence, edited_sentence)
+        # mlqa (question, context)
+        # xlsum (title, summary) (title, text)
+        # "sentence-transformers data": [],  # https://huggingface.co/datasets/sentence-transformers/embedding-training-data  # TODO check this further
+        # "wikipedia": [],  # title + section title, passage
+        # "reddit": [],  # title, body
+        # "stackexchange": [],  # (title, upvoted answer) (title+body, upvoted answer)
+        # "s2orc": [],  # (title, abstract) (title, citation title) (abstract, citation abstract)
+    },
 )
 
 bge_base_en_v1_5 = ModelMeta(
@@ -50,6 +81,37 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
+    public_training_data=True,  # https://data.baai.ac.cn/details/BAAI-MTP
+    public_training_code=None,  # seemingly released (at least for some models), but the link is broken
+    training_datasets={
+        # source: https://data.baai.ac.cn/details/BAAI-MTP
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "AmazonReviewsClassification": [
+            "validation",
+            "test",
+        ],  # assumed from: amazon_reviews_multi
+        "MLQARetrieval": [
+            "validation",
+            "test",
+        ],  # assumed from mlqa (question, context)
+        # not in mteb
+        # Dataset Pairs
+        # wudao (title, passage)
+        # cmrc2018 (query, context)
+        # dureader (query, context)
+        # simclue (sentence_a, sentence_b)
+        # csl (title, abstract)
+        # amazon_reviews_multi (title, body)
+        # wiki_atomic_edits (base_sentence, edited_sentence)
+        # mlqa (question, context)
+        # xlsum (title, summary) (title, text)
+        # "sentence-transformers data": [],  # https://huggingface.co/datasets/sentence-transformers/embedding-training-data  # TODO check this further
+        # "wikipedia": [],  # title + section title, passage
+        # "reddit": [],  # title, body
+        # "stackexchange": [],  # (title, upvoted answer) (title+body, upvoted answer)
+        # "s2orc": [],  # (title, abstract) (title, citation title) (abstract, citation abstract)
+    },
 )
 
 bge_large_en_v1_5 = ModelMeta(
@@ -73,4 +135,35 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
+    public_training_data=True,  # https://data.baai.ac.cn/details/BAAI-MTP
+    public_training_code=None,  # seemingly released (at least for some models), but the link is broken
+    training_datasets={
+        # source: https://data.baai.ac.cn/details/BAAI-MTP
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "AmazonReviewsClassification": [
+            "validation",
+            "test",
+        ],  # assumed from: amazon_reviews_multi
+        "MLQARetrieval": [
+            "validation",
+            "test",
+        ],  # assumed from mlqa (question, context)
+        # not in mteb
+        # Dataset Pairs
+        # wudao (title, passage)
+        # cmrc2018 (query, context)
+        # dureader (query, context)
+        # simclue (sentence_a, sentence_b)
+        # csl (title, abstract)
+        # amazon_reviews_multi (title, body)
+        # wiki_atomic_edits (base_sentence, edited_sentence)
+        # mlqa (question, context)
+        # xlsum (title, summary) (title, text)
+        # "sentence-transformers data": [],  # https://huggingface.co/datasets/sentence-transformers/embedding-training-data  # TODO check this further
+        # "wikipedia": [],  # title + section title, passage
+        # "reddit": [],  # title, body
+        # "stackexchange": [],  # (title, upvoted answer) (title+body, upvoted answer)
+        # "s2orc": [],  # (title, abstract) (title, citation title) (abstract, citation abstract)
+    },
 )
diff --git a/mteb/models/cohere_models.py b/mteb/models/cohere_models.py
index 43a797342d..4b34045f89 100644
--- a/mteb/models/cohere_models.py
+++ b/mteb/models/cohere_models.py
@@ -235,6 +235,9 @@ def encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
+    public_training_data=False,  # assumed
+    public_training_code=False,  # assumed
+    training_datasets=None,
 )
 
 cohere_eng_3 = ModelMeta(
@@ -257,6 +260,9 @@ def encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
+    public_training_data=False,  # assumed
+    public_training_code=False,  # assumed
+    training_datasets=None,
 )
 
 cohere_mult_light_3 = ModelMeta(
@@ -279,6 +285,9 @@ def encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
+    public_training_data=False,  # assumed
+    public_training_code=False,  # assumed
+    training_datasets=None,
 )
 
 cohere_eng_light_3 = ModelMeta(
@@ -301,4 +310,7 @@ def encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
+    public_training_data=False,  # assumed
+    public_training_code=False,  # assumed
+    training_datasets=None,
 )
diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py
index 4fee54de79..97b117002b 100644
--- a/mteb/models/e5_models.py
+++ b/mteb/models/e5_models.py
@@ -134,6 +134,26 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2402.05672
+        # table 1:
+        # Wikipedia 150M
+        # mC4 160M
+        # Multilingual CC News 160M
+        # NLLB 160M
+        # Reddit 160M
+        # S2ORC 50M
+        # Stackexchange 50M
+        # xP3 80M
+        # Misc. SBERT Data 10M
+        # ----
+        # from Misc. SBERT Data 10M:
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],  # dev?
+    },
 )
 
 e5_mult_base = ModelMeta(
@@ -156,6 +176,26 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2402.05672
+        # table 1:
+        # Wikipedia 150M
+        # mC4 160M
+        # Multilingual CC News 160M
+        # NLLB 160M
+        # Reddit 160M
+        # S2ORC 50M
+        # Stackexchange 50M
+        # xP3 80M
+        # Misc. SBERT Data 10M
+        # ----
+        # from Misc. SBERT Data 10M:
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],  # dev?
+    },
 )
 
 e5_mult_large = ModelMeta(
@@ -179,6 +219,26 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2402.05672
+        # table 1:
+        # Wikipedia 150M
+        # mC4 160M
+        # Multilingual CC News 160M
+        # NLLB 160M
+        # Reddit 160M
+        # S2ORC 50M
+        # Stackexchange 50M
+        # xP3 80M
+        # Misc. SBERT Data 10M
+        # ----
+        # from Misc. SBERT Data 10M:
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],  # dev?
+    },
 )
 
 e5_eng_small_v2 = ModelMeta(
@@ -201,6 +261,14 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2212.03533
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],  # dev?
+    },
 )
 
 e5_eng_small = ModelMeta(
@@ -224,6 +292,14 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2212.03533
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],  # dev?
+    },
 )
 
 e5_eng_base_v2 = ModelMeta(
@@ -249,6 +325,14 @@
     use_instructions=True,
     superseded_by=None,
     adapted_from=None,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2212.03533
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],  # dev?
+    },
 )
 
 e5_eng_large_v2 = ModelMeta(
@@ -274,6 +358,14 @@
     use_instructions=True,
     superseded_by=None,
     adapted_from=None,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2212.03533
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],  # dev?
+    },
 )
 
 e5_large = ModelMeta(
@@ -299,6 +391,14 @@
     use_instructions=True,
     superseded_by="intfloat/e5-large-v2",
     adapted_from=None,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2212.03533
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],  # dev?
+    },
 )
 
 e5_base = ModelMeta(
@@ -324,4 +424,12 @@
     use_instructions=True,
     superseded_by="intfloat/e5-base-v2",
     adapted_from=None,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2212.03533
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],  # dev?
+    },
 )
diff --git a/mteb/models/google_models.py b/mteb/models/google_models.py
index 4fcd21ae6e..1b4a4a13ff 100644
--- a/mteb/models/google_models.py
+++ b/mteb/models/google_models.py
@@ -152,6 +152,9 @@ def encode(
     similarity_fn_name="cosine",  # assumed
     framework=["API"],
     use_instructions=True,
+    public_training_data=False,  # assumed
+    public_training_code=False,  # assumed
+    training_datasets=None,
 )
 
 google_text_emb_005 = ModelMeta(
@@ -173,6 +176,9 @@ def encode(
     similarity_fn_name="cosine",  # assumed
     framework=["API"],
     use_instructions=True,
+    public_training_data=False,  # assumed
+    public_training_code=False,  # assumed
+    training_datasets=None,
 )
 
 google_text_multilingual_emb_002 = ModelMeta(
@@ -194,4 +200,7 @@ def encode(
     similarity_fn_name="cosine",  # assumed
     framework=["API"],
     use_instructions=True,
+    public_training_data=False,  # assumed
+    public_training_code=False,  # assumed
+    training_datasets=None,
 )
diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py
index 1541d3ca3d..37da533457 100644
--- a/mteb/models/model2vec_models.py
+++ b/mteb/models/model2vec_models.py
@@ -27,7 +27,7 @@ def __init__(
             **kwargs: Additional arguments to pass to the wrapper.
         """
         try:
-            from model2vec import StaticModel
+            from model2vec import StaticModel  # type: ignore
         except ModuleNotFoundError as e:
             raise ModuleNotFoundError(
                 "To use the Model2Vec models `model2vec` is required. Please install it with `pip install mteb[model2vec]`."
@@ -63,7 +63,7 @@ def encode(
     open_weights=True,
     revision="5f4f5ca159b7321a8b39739bba0794fa0debddf4",
     release_date="2024-09-21",
-    n_parameters=103 * 1e6,
+    n_parameters=int(103 * 1e6),
     max_tokens=np.inf,  # Theoretically infinite
     embed_dim=256,
     license="mit",
@@ -72,6 +72,20 @@
     reference="https://huggingface.co/minishlab/M2V_base_glove_subword",
     use_instructions=False,
     adapted_from="BAAI/bge-base-en-v1.5",
+    public_training_data=True,
+    public_training_code=None,  # distilled model
+    training_datasets={  # same as the adapted-from model
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "AmazonReviewsClassification": [
+            "validation",
+            "test",
+        ],
+        "MLQARetrieval": [
+            "validation",
+            "test",
+        ],
+    },
     superseded_by=None,
 )
 
@@ -86,7 +100,7 @@ def encode(
     open_weights=True,
     revision="38ebd7f10f71e67fa8db898290f92b82e9cfff2b",
     release_date="2024-09-21",
-    n_parameters=102 * 1e6,
+    n_parameters=int(102 * 1e6),
     max_tokens=np.inf,
     embed_dim=256,
     license="mit",
@@ -96,6 +110,20 @@
     use_instructions=False,
     adapted_from="BAAI/bge-base-en-v1.5",
     superseded_by=None,
+    public_training_data=True,
+    public_training_code=None,  # distilled model
+    training_datasets={  # same as the adapted-from model
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "AmazonReviewsClassification": [
+            "validation",
+            "test",
+        ],
+        "MLQARetrieval": [
+            "validation",
+            "test",
+        ],
+    },
 )
 
 m2v_base_output = ModelMeta(
@@ -108,7 +136,7 @@ def encode(
     open_weights=True,
     revision="02460ae401a22b09d2c6652e23371398329551e2",
     release_date="2024-09-21",
-    n_parameters=7.56 * 1e6,
+    n_parameters=int(7.56 * 1e6),
     max_tokens=np.inf,
     embed_dim=256,
     license="mit",
@@ -118,6 +146,20 @@
     use_instructions=False,
     adapted_from="BAAI/bge-base-en-v1.5",
     superseded_by=None,
+    public_training_data=True,
+    public_training_code=None,  # distilled model
+    training_datasets={  # same as the adapted-from model
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "AmazonReviewsClassification": [
+            "validation",
+            "test",
+        ],
+        "MLQARetrieval": [
+            "validation",
+            "test",
+        ],
+    },
 )
 
 m2v_multilingual_output = ModelMeta(
@@ -130,7 +172,7 @@ def encode(
     open_weights=True,
     revision="2cf4ec4e1f51aeca6c55cf9b93097d00711a6305",
     release_date="2024-09-21",
-    n_parameters=128 * 1e6,
+    n_parameters=int(128 * 1e6),
     max_tokens=np.inf,
     embed_dim=256,
     license="mit",
@@ -140,6 +182,8 @@
     use_instructions=False,
     adapted_from="sentence-transformers/LaBSE",
     superseded_by=None,
+    public_training_data=True,
+    public_training_code=None,  # distilled model
 )
 
 potion_base_2m = ModelMeta(
@@ -162,6 +206,20 @@
     use_instructions=False,
     adapted_from="BAAI/bge-base-en-v1.5",
     superseded_by=None,
+    public_training_data=True,
+    public_training_code=None,  # distilled model
+    training_datasets={  # same as the adapted-from model
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "AmazonReviewsClassification": [
+            "validation",
+            "test",
+        ],
+        "MLQARetrieval": [
+            "validation",
+            "test",
+        ],
+    },
 )
 
 potion_base_4m = ModelMeta(
@@ -184,6 +242,20 @@
     use_instructions=False,
     adapted_from="BAAI/bge-base-en-v1.5",
     superseded_by=None,
+    public_training_data=True,
+    public_training_code=None,  # distilled model
+    training_datasets={  # same as the adapted-from model
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "AmazonReviewsClassification": [
+            "validation",
+            "test",
+        ],
+        "MLQARetrieval": [
+            "validation",
+            "test",
+        ],
+    },
 )
 
 potion_base_8m = ModelMeta(
@@ -206,4 +278,18 @@
     use_instructions=False,
     adapted_from="BAAI/bge-base-en-v1.5",
     superseded_by=None,
+    public_training_data=True,
+    public_training_code=None,  # distilled model
+    training_datasets={  # same as the adapted-from model
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "AmazonReviewsClassification": [
+            "validation",
+            "test",
+        ],
+        "MLQARetrieval": [
+            "validation",
+            "test",
+        ],
+    },
 )
diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py
index aecacf549a..c187bfa317 100644
--- a/mteb/models/openai_models.py
+++ b/mteb/models/openai_models.py
@@ -136,6 +136,9 @@ def _to_numpy(self, embedding_response) -> np.ndarray:
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=False,
+    public_training_data=False,  # assumed
+    public_training_code=False,  # assumed
+    training_datasets=None,
 )
 text_embedding_3_large = ModelMeta(
     name="openai/text-embedding-3-large",
@@ -156,6 +159,9 @@ def _to_numpy(self, embedding_response) -> np.ndarray:
     use_instructions=False,
     n_parameters=None,
     memory_usage=None,
+    public_training_data=False,  # assumed
+    public_training_code=False,  # assumed
+    training_datasets=None,
 )
 text_embedding_ada_002 = ModelMeta(
     name="openai/text-embedding-ada-002",
@@ -176,4 +182,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray:
     use_instructions=False,
     n_parameters=None,
     memory_usage=None,
+    public_training_data=False,  # assumed
+    public_training_code=False,  # assumed
+    training_datasets=None,
 )
diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py
index ea02508c36..28349d60d9 100644
--- a/mteb/models/sentence_transformers_models.py
+++ b/mteb/models/sentence_transformers_models.py
@@ -71,34 +71,45 @@
     embed_dim=384,
     license="apache-2.0",
     max_tokens=256,
-    reference="https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+    reference="https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2",
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=False,
     superseded_by=None,
     adapted_from=None,
+    public_training_code=False,  # does sentence-transformers count?
+    public_training_data=True,
     training_datasets={
-        "s2orc": ["train"],
-        "flax-sentence-embeddings/stackexchange_xml": ["train"],
-        "ms_marco": ["train"],
-        "gooaq": ["train"],
-        "yahoo_answers_topics": ["train"],
-        "code_search_net": ["train"],
-        "search_qa": ["train"],
-        "eli5": ["train"],
-        "snli": ["train"],
-        "multi_nli": ["train"],
-        "wikihow": ["train"],
-        "natural_questions": ["train"],
-        "trivia_qa": ["train"],
-        "embedding-data/sentence-compression": ["train"],
-        "embedding-data/flickr30k-captions": ["train"],
-        "embedding-data/altlex": ["train"],
-        "embedding-data/simple-wiki": ["train"],
-        "embedding-data/QQP": ["train"],
-        "embedding-data/SPECTER": ["train"],
-        "embedding-data/PAQ_pairs": ["train"],
-        "embedding-data/WikiAnswers": ["train"],
+        # source: frontmatter in the README
+        # trained on stack exchange, unsure if sources match
+        "StackExchangeClusteringP2P": ["test"],
+        "StackExchangeClusteringP2P.v2": ["test"],
+        "StackExchangeClustering": ["test"],
+        "StackExchangeClustering.v2": ["test"],
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],
+        # Non-MTEB sources
+        # "s2orc": ["train"],
+        # "flax-sentence-embeddings/stackexchange_xml": ["train"],
+        # "ms_marco": ["train"],
+        # "gooaq": ["train"],
+        # "yahoo_answers_topics": ["train"],
+        # "code_search_net": ["train"],
+        # "search_qa": ["train"],
+        # "eli5": ["train"],
+        # "snli": ["train"],
+        # "multi_nli": ["train"],
+        # "wikihow": ["train"],
+        # "trivia_qa": ["train"],
+        # "embedding-data/sentence-compression": ["train"],
+        # "embedding-data/flickr30k-captions": ["train"],
+        # "embedding-data/altlex": ["train"],
+        # "embedding-data/simple-wiki": ["train"],
+        # "embedding-data/QQP": ["train"],
+        # "embedding-data/SPECTER": ["train"],
+        # "embedding-data/PAQ_pairs": ["train"],
+        # "embedding-data/WikiAnswers": ["train"],
     },
 )
 
@@ -195,28 +206,39 @@
     use_instructions=False,
     superseded_by=None,
     adapted_from=None,
+    public_training_code=False,  # does sentence-transformers count?
+    public_training_data=True,
     training_datasets={
-        "s2orc": ["train"],
-        "flax-sentence-embeddings/stackexchange_xml": ["train"],
-        "ms_marco": ["train"],
-        "gooaq": ["train"],
-        "yahoo_answers_topics": ["train"],
-        "code_search_net": ["train"],
-        "search_qa": ["train"],
-        "eli5": ["train"],
-        "snli": ["train"],
-        "multi_nli": ["train"],
-        "wikihow": ["train"],
-        "natural_questions": ["train"],
-        "trivia_qa": ["train"],
-        "embedding-data/sentence-compression": ["train"],
-        "embedding-data/flickr30k-captions": ["train"],
-        "embedding-data/altlex": ["train"],
-        "embedding-data/simple-wiki": ["train"],
-        "embedding-data/QQP": ["train"],
-        "embedding-data/SPECTER": ["train"],
-        "embedding-data/PAQ_pairs": ["train"],
-        "embedding-data/WikiAnswers": ["train"],
+        # source: frontmatter in the README
+        # trained on stack exchange, unsure if sources match
+        "StackExchangeClusteringP2P": ["test"],
+        "StackExchangeClusteringP2P.v2": ["test"],
+        "StackExchangeClustering": ["test"],
+        "StackExchangeClustering.v2": ["test"],
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],
+        # Non-MTEB sources
+        # "s2orc": ["train"],
+        # "flax-sentence-embeddings/stackexchange_xml": ["train"],
+        # "ms_marco": ["train"],
+        # "gooaq": ["train"],
+        # "yahoo_answers_topics": ["train"],
+        # "code_search_net": ["train"],
+        # "search_qa": ["train"],
+        # "eli5": ["train"],
+        # "snli": ["train"],
+        # "multi_nli": ["train"],
+        # "wikihow": ["train"],
+        # "trivia_qa": ["train"],
+        # "embedding-data/sentence-compression": ["train"],
+        # "embedding-data/flickr30k-captions": ["train"],
+        # "embedding-data/altlex": ["train"],
+        # "embedding-data/simple-wiki": ["train"],
+        # "embedding-data/QQP": ["train"],
+        # "embedding-data/SPECTER": ["train"],
+        # "embedding-data/PAQ_pairs": ["train"],
+        # "embedding-data/WikiAnswers": ["train"],
     },
 )
 
@@ -318,28 +340,39 @@
     use_instructions=False,
     superseded_by=None,
     adapted_from=None,
+    public_training_code=False,  # does sentence-transformers count?
+    public_training_data=True,
     training_datasets={
-        "s2orc": ["train"],
-        "flax-sentence-embeddings/stackexchange_xml": ["train"],
-        "ms_marco": ["train"],
-        "gooaq": ["train"],
-        "yahoo_answers_topics": ["train"],
-        "code_search_net": ["train"],
-        "search_qa": ["train"],
-        "eli5": ["train"],
-        "snli": ["train"],
-        "multi_nli": ["train"],
-        "wikihow": ["train"],
-        "natural_questions": ["train"],
-        "trivia_qa": ["train"],
-        "embedding-data/sentence-compression": ["train"],
-        "embedding-data/flickr30k-captions": ["train"],
-        "embedding-data/altlex": ["train"],
-        "embedding-data/simple-wiki": ["train"],
-        "embedding-data/QQP": ["train"],
-        "embedding-data/SPECTER": ["train"],
-        "embedding-data/PAQ_pairs": ["train"],
-        "embedding-data/WikiAnswers": ["train"],
+        # source: frontmatter in the README
+        # trained on stack exchange, unsure if sources match
+        "StackExchangeClusteringP2P": ["test"],
+        "StackExchangeClusteringP2P.v2": ["test"],
+        "StackExchangeClustering": ["test"],
+        "StackExchangeClustering.v2": ["test"],
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],
+        # Non-MTEB sources
+        # "s2orc": ["train"],
+        # "flax-sentence-embeddings/stackexchange_xml": ["train"],
+        # "ms_marco": ["train"],
+        # "gooaq": ["train"],
+        # "yahoo_answers_topics": ["train"],
+        # "code_search_net": ["train"],
+        # "search_qa": ["train"],
+        # "eli5": ["train"],
+        # "snli": ["train"],
+        # "multi_nli": ["train"],
+        # "wikihow": ["train"],
+        # "trivia_qa": ["train"],
+        # "embedding-data/sentence-compression": ["train"],
+        # "embedding-data/flickr30k-captions": ["train"],
+        # "embedding-data/altlex": ["train"],
+        # "embedding-data/simple-wiki": ["train"],
+        # "embedding-data/QQP": ["train"],
+        # "embedding-data/SPECTER": ["train"],
+        # "embedding-data/PAQ_pairs": ["train"],
+        # "embedding-data/WikiAnswers": ["train"],
     },
 )
 
@@ -361,9 +394,12 @@
     superseded_by=None,
     adapted_from=None,
     training_datasets={
-        "sentence-transformers/all-nli": ["train"],
-        "sentence-transformers/stsb": ["train"],
-        "sentence-transformers/quora-duplicates": ["train"],
-        "sentence-transformers/natural-questions": ["train"],
+        # source: yaml header
+        "NQ": ["test"],
+        # not in MTEB:
+        # "sentence-transformers/all-nli": ["train"],
+        # "sentence-transformers/stsb": ["train"],
+        # "sentence-transformers/quora-duplicates": ["train"],
+        # "sentence-transformers/natural-questions": ["train"],
     },
 )
diff --git a/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py b/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py
index d6bb252304..c411138e9f 100644
--- a/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py
+++ b/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py
@@ -92,7 +92,6 @@ class StackExchangeClusteringP2P(AbsTaskClustering):
         eval_langs=["eng-Latn"],
         main_score="v_measure",
         date=None,
-        form=None,
         domains=None,
         task_subtypes=None,
         license=None,
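
Note (not part of the patch): the fields added above are plain `ModelMeta` attributes, so once the patch is applied they can be read back programmatically. A minimal sketch, assuming `mteb.get_model_meta` is available as in current mteb releases; the chosen model name is just an example:

```python
import mteb

# Look up the metadata object that this patch annotates.
meta = mteb.get_model_meta("intfloat/multilingual-e5-small")

# The three fields introduced by this PR. Values reflect the annotations
# above; False means "couldn't find", None means "unknown/not disclosed".
print(meta.public_training_code)  # False
print(meta.public_training_data)  # False
print(meta.training_datasets)     # e.g. {"NQ": ["test"], "NQHardNegatives": ["test"], "MSMARCO": ["train"]}
```

This is the intended use of the annotations: tooling can flag MTEB tasks (and splits) that overlap with a model's training data when reporting benchmark results.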