From 3f093c86a5e4bccd31e8a9ed860d1a33bd64b391 Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen
Date: Sat, 11 Jan 2025 17:41:12 +0100
Subject: [PATCH] fix: added annotations for training data (#1742)

* fix: Added annotations for arctic embed models

* added google and bge

* added cohere

* Added e5

* added bge based model2vec

* annotated OpenAI

* format and update annotations

---
 mteb/models/arctic_models.py                | 183 ++++++++++++++++++
 mteb/models/bge_models.py                   |  93 +++++++++
 mteb/models/cohere_models.py                |  12 ++
 mteb/models/e5_models.py                    | 108 +++++++++++
 mteb/models/google_models.py                |   9 +
 mteb/models/model2vec_models.py             |  96 ++++++++-
 mteb/models/openai_models.py                |   9 +
 mteb/models/sentence_transformers_models.py | 172 +++++++++-------
 .../eng/StackExchangeClusteringP2P.py       |   1 -
 9 files changed, 609 insertions(+), 74 deletions(-)

diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py
index e16de37f74..b4c2b97ac6 100644
--- a/mteb/models/arctic_models.py
+++ b/mteb/models/arctic_models.py
@@ -103,6 +103,32 @@
     use_instructions=True,
     adapted_from="sentence-transformers/all-MiniLM-L6-v2",
     superseded_by=None,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2405.05374
+        # splits not specified, so assuming everything
+        # in MTEB
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "HotPotQA": ["test"],
+        "HotPotQAHardNegatives": ["test"],
+        "HotPotQA-PL": ["test"],  # translated from HotPotQA (not trained on)
+        "FEVER": ["test"],
+        "FEVERHardNegatives": ["test"],
+        # not in MTEB
+        # trained on stack exchange (title-body)
+        # "stackexchange": [],
+        # potentially means that:
+        # "StackExchangeClusteringP2P": ["test"],
+        # "StackExchangeClusteringP2P.v2": ["test"],
+        # "StackExchangeClustering": ["test"],
+        # "StackExchangeClustering.v2": ["test"],
+        # not in MTEB
+        # "paq": [],
+        # "s2orc": [],
+        # "other": [],  # undisclosed, including webdata
+    },  # also uses synthetic data
 )
 
 
@@ -128,6 +154,32 @@
     use_instructions=True,
     adapted_from="intfloat/e5-small-unsupervised",
     superseded_by=None,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2405.05374
+        # splits not specified, so assuming everything
+        # in MTEB
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "HotPotQA": ["test"],
+        "HotPotQAHardNegatives": ["test"],
+        "HotPotQA-PL": ["test"],  # translated from HotPotQA (not trained on)
+        "FEVER": ["test"],
+        "FEVERHardNegatives": ["test"],
+        # not in MTEB
+        # trained on stack exchange (title-body)
+        # "stackexchange": [],
+        # potentially means that:
+        # "StackExchangeClusteringP2P": ["test"],
+        # "StackExchangeClusteringP2P.v2": ["test"],
+        # "StackExchangeClustering": ["test"],
+        # "StackExchangeClustering.v2": ["test"],
+        # not in MTEB
+        # "paq": [],
+        # "s2orc": [],
+        # "other": [],  # undisclosed, including webdata
+    },  # also uses synthetic data
 )
 
 
@@ -153,6 +205,32 @@
     use_instructions=True,
     adapted_from="intfloat/e5-base-unsupervised",
     superseded_by="Snowflake/snowflake-arctic-embed-m-v1.5",
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2405.05374
+        # splits not specified, so assuming everything
+        # in MTEB
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "HotPotQA": ["test"],
+        "HotPotQAHardNegatives": ["test"],
+        "HotPotQA-PL": ["test"],  # translated from HotPotQA (not trained on)
+        "FEVER": ["test"],
"FEVERHardNegatives": ["test"], + # not in MTEB + # trained on stack exchange (title-body) + # "stackexchange": [], + # potentially means that: + # "StackExchangeClusteringP2P": ["test"], + # "StackExchangeClusteringP2P.v2": ["test"], + # "StackExchangeClustering": ["test"], + # "StackExchangeClustering.v2": ["test"], + # not in MTEB + # "paq": [], + # "s2orc": [], + # "other": [], # undisclosed including webdata + }, # also use synthetic ) arctic_embed_m_long = ModelMeta( @@ -178,6 +256,33 @@ use_instructions=True, adapted_from="nomic-ai/nomic-embed-text-v1-unsupervised", superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0", + public_training_data=False, # couldn't find + public_training_code=False, # couldn't find + training_datasets={ + # source: https://arxiv.org/pdf/2405.05374 + # splits not specified to assuming everything + # in MTEB + "NQ": ["test"], + "NQHardNegatives": ["test"], + "HotPotQA": ["test"], + "HotPotQAHardNegatives": ["test"], + "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) + "FEVER": ["test"], + "FEVERHardNegatives": ["test"], + # trained on stack exchange, unsure if sources match + # not in MTEB + # trained on stack exchange (title-body) + # "stackexchange": [], + # potentially means that: + # "StackExchangeClusteringP2P": ["test"], + # "StackExchangeClusteringP2P.v2": ["test"], + # "StackExchangeClustering": ["test"], + # "StackExchangeClustering.v2": ["test"], + # not in MTEB + # "paq": [], + # "s2orc": [], + # "other": [], # undisclosed including webdata + }, # also use synthetic ) arctic_embed_l = ModelMeta( @@ -202,6 +307,32 @@ use_instructions=True, adapted_from="intfloat/e5-base-unsupervised", superseded_by="Snowflake/snowflake-arctic-embed-l-v2.0", + public_training_data=False, # couldn't find + public_training_code=False, # couldn't find + training_datasets={ + # source: https://arxiv.org/pdf/2405.05374 + # splits not specified to assuming everything + # in MTEB + "NQ": ["test"], + "NQHardNegatives": ["test"], + "HotPotQA": ["test"], + "HotPotQAHardNegatives": ["test"], + "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) + "FEVER": ["test"], + "FEVERHardNegatives": ["test"], + # not in MTEB + # trained on stack exchange (title-body) + # "stackexchange": [], + # potentially means that: + # "StackExchangeClusteringP2P": ["test"], + # "StackExchangeClusteringP2P.v2": ["test"], + # "StackExchangeClustering": ["test"], + # "StackExchangeClustering.v2": ["test"], + # not in MTEB + # "paq": [], + # "s2orc": [], + # "other": [], # undisclosed including webdata + }, # also use synthetic ) arctic_embed_m_v1_5 = ModelMeta( @@ -254,6 +385,32 @@ use_instructions=True, adapted_from="Alibaba-NLP/gte-multilingual-base", superseded_by=None, + public_training_data=False, # couldn't find + public_training_code=False, # couldn't find + training_datasets={ + # source: https://arxiv.org/pdf/2405.05374 + # splits not specified to assuming everything + # in MTEB + "NQ": ["test"], + "NQHardNegatives": ["test"], + "HotPotQA": ["test"], + "HotPotQAHardNegatives": ["test"], + "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) + "FEVER": ["test"], + "FEVERHardNegatives": ["test"], + # not in MTEB + # trained on stack exchange (title-body) + # "stackexchange": [], + # potentially means that: + # "StackExchangeClusteringP2P": ["test"], + # "StackExchangeClusteringP2P.v2": ["test"], + # "StackExchangeClustering": ["test"], + # "StackExchangeClustering.v2": ["test"], + # not in MTEB + # "paq": [], + # "s2orc": [], + # "other": 
+    },  # also uses synthetic data
 )
 
 arctic_embed_l_v2_0 = ModelMeta(
@@ -278,4 +435,30 @@
     use_instructions=True,
     adapted_from="BAAI/bge-m3-retromae",
     superseded_by=None,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2405.05374
+        # splits not specified, so assuming everything
+        # in MTEB
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "HotPotQA": ["test"],
+        "HotPotQAHardNegatives": ["test"],
+        "HotPotQA-PL": ["test"],  # translated from HotPotQA (not trained on)
+        "FEVER": ["test"],
+        "FEVERHardNegatives": ["test"],
+        # not in MTEB
+        # trained on stack exchange (title-body)
+        # "stackexchange": [],
+        # potentially means that:
+        # "StackExchangeClusteringP2P": ["test"],
+        # "StackExchangeClusteringP2P.v2": ["test"],
+        # "StackExchangeClustering": ["test"],
+        # "StackExchangeClustering.v2": ["test"],
+        # not in MTEB
+        # "paq": [],
+        # "s2orc": [],
+        # "other": [],  # undisclosed, including webdata
+    },  # also uses synthetic data
 )
diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py
index cc183374c6..276d28526f 100644
--- a/mteb/models/bge_models.py
+++ b/mteb/models/bge_models.py
@@ -27,6 +27,37 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
+    public_training_data=True,  # https://data.baai.ac.cn/details/BAAI-MTP
+    public_training_code=None,  # seemingly released (at least for some models), but the link is broken
+    training_datasets={
+        # source: https://data.baai.ac.cn/details/BAAI-MTP
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "AmazonReviewsClassification": [
+            "validation",
+            "test",
+        ],  # assumed from: amazon_reviews_multi
+        "MLQARetrieval": [
+            "validation",
+            "test",
+        ],  # assumed from mlqa (question, context)
+        # not in mteb
+        # Dataset Pairs
+        # wudao (title, passage)
+        # cmrc2018 (query, context)
+        # dureader (query, context)
+        # simclue (sentence_a, sentence_b)
+        # csl (title, abstract)
+        # amazon_reviews_multi (title, body)
+        # wiki_atomic_edits (base_sentence, edited_sentence)
+        # mlqa (question, context)
+        # xlsum (title, summary) (title, text)
+        # "sentence-transformers data": [],  # https://huggingface.co/datasets/sentence-transformers/embedding-training-data  # TODO check this further
+        # "wikipedia": [],  # title + section title, passage
+        # "reddit": [],  # title, body
+        # "stackexchange": [],  # (title, upvoted answer) (title+body, upvoted answer)
+        # "s2orc": [],  # (title, abstract) (title, citation title) (abstract, citation abstract)
+    },
 )
 
 bge_base_en_v1_5 = ModelMeta(
@@ -50,6 +81,37 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
+    public_training_data=True,  # https://data.baai.ac.cn/details/BAAI-MTP
+    public_training_code=None,  # seemingly released (at least for some models), but the link is broken
+    training_datasets={
+        # source: https://data.baai.ac.cn/details/BAAI-MTP
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "AmazonReviewsClassification": [
+            "validation",
+            "test",
+        ],  # assumed from: amazon_reviews_multi
+        "MLQARetrieval": [
+            "validation",
+            "test",
+        ],  # assumed from mlqa (question, context)
+        # not in mteb
+        # Dataset Pairs
+        # wudao (title, passage)
+        # cmrc2018 (query, context)
+        # dureader (query, context)
+        # simclue (sentence_a, sentence_b)
+        # csl (title, abstract)
+        # amazon_reviews_multi (title, body)
+        # wiki_atomic_edits (base_sentence, edited_sentence)
+        # mlqa (question, context)
+        # xlsum (title, summary) (title, text)
+        # "sentence-transformers data": [],  # https://huggingface.co/datasets/sentence-transformers/embedding-training-data  # TODO check this further
+        # "wikipedia": [],  # title + section title, passage
+        # "reddit": [],  # title, body
+        # "stackexchange": [],  # (title, upvoted answer) (title+body, upvoted answer)
+        # "s2orc": [],  # (title, abstract) (title, citation title) (abstract, citation abstract)
+    },
 )
 
 bge_large_en_v1_5 = ModelMeta(
@@ -73,4 +135,35 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
+    public_training_data=True,  # https://data.baai.ac.cn/details/BAAI-MTP
+    public_training_code=None,  # seemingly released (at least for some models), but the link is broken
+    training_datasets={
+        # source: https://data.baai.ac.cn/details/BAAI-MTP
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "AmazonReviewsClassification": [
+            "validation",
+            "test",
+        ],  # assumed from: amazon_reviews_multi
+        "MLQARetrieval": [
+            "validation",
+            "test",
+        ],  # assumed from mlqa (question, context)
+        # not in mteb
+        # Dataset Pairs
+        # wudao (title, passage)
+        # cmrc2018 (query, context)
+        # dureader (query, context)
+        # simclue (sentence_a, sentence_b)
+        # csl (title, abstract)
+        # amazon_reviews_multi (title, body)
+        # wiki_atomic_edits (base_sentence, edited_sentence)
+        # mlqa (question, context)
+        # xlsum (title, summary) (title, text)
+        # "sentence-transformers data": [],  # https://huggingface.co/datasets/sentence-transformers/embedding-training-data  # TODO check this further
+        # "wikipedia": [],  # title + section title, passage
+        # "reddit": [],  # title, body
+        # "stackexchange": [],  # (title, upvoted answer) (title+body, upvoted answer)
+        # "s2orc": [],  # (title, abstract) (title, citation title) (abstract, citation abstract)
+    },
 )
diff --git a/mteb/models/cohere_models.py b/mteb/models/cohere_models.py
index 43a797342d..4b34045f89 100644
--- a/mteb/models/cohere_models.py
+++ b/mteb/models/cohere_models.py
@@ -235,6 +235,9 @@ def encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
+    public_training_data=False,  # assumed
+    public_training_code=False,  # assumed
+    training_datasets=None,
 )
 
 cohere_eng_3 = ModelMeta(
@@ -257,6 +260,9 @@ def encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
+    public_training_data=False,  # assumed
+    public_training_code=False,  # assumed
+    training_datasets=None,
 )
 
 cohere_mult_light_3 = ModelMeta(
@@ -279,6 +285,9 @@ def encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
+    public_training_data=False,  # assumed
+    public_training_code=False,  # assumed
+    training_datasets=None,
 )
 
 cohere_eng_light_3 = ModelMeta(
@@ -301,4 +310,7 @@ def encode(
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=True,
+    public_training_data=False,  # assumed
+    public_training_code=False,  # assumed
+    training_datasets=None,
 )
diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py
index 4fee54de79..97b117002b 100644
--- a/mteb/models/e5_models.py
+++ b/mteb/models/e5_models.py
@@ -134,6 +134,26 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2402.05672
+        # table 1:
+        # Wikipedia 150M
+        # mC4 160M
+        # Multilingual CC News 160M
+        # NLLB 160M
+        # Reddit 160M
+        # S2ORC 50M
+        # Stackexchange 50M
+        # xP3 80M
+        # Misc. SBERT Data 10M
+        # ----
+        # from Misc. SBERT Data 10M:
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],  # dev?
+    },
 )
 
 e5_mult_base = ModelMeta(
@@ -156,6 +176,26 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2402.05672
+        # table 1:
+        # Wikipedia 150M
+        # mC4 160M
+        # Multilingual CC News 160M
+        # NLLB 160M
+        # Reddit 160M
+        # S2ORC 50M
+        # Stackexchange 50M
+        # xP3 80M
+        # Misc. SBERT Data 10M
+        # ----
+        # from Misc. SBERT Data 10M:
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],  # dev?
+    },
 )
 
 e5_mult_large = ModelMeta(
@@ -179,6 +219,26 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2402.05672
+        # table 1:
+        # Wikipedia 150M
+        # mC4 160M
+        # Multilingual CC News 160M
+        # NLLB 160M
+        # Reddit 160M
+        # S2ORC 50M
+        # Stackexchange 50M
+        # xP3 80M
+        # Misc. SBERT Data 10M
+        # ----
+        # from Misc. SBERT Data 10M:
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],  # dev?
+    },
 )
 
 e5_eng_small_v2 = ModelMeta(
@@ -201,6 +261,14 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2212.03533
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],  # dev?
+    },
 )
 
 e5_eng_small = ModelMeta(
@@ -224,6 +292,14 @@
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2212.03533
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],  # dev?
+    },
 )
 
 e5_eng_base_v2 = ModelMeta(
@@ -249,6 +325,14 @@
     use_instructions=True,
     superseded_by=None,
     adapted_from=None,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2212.03533
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],  # dev?
+    },
 )
 
 e5_eng_large_v2 = ModelMeta(
@@ -274,6 +358,14 @@
     use_instructions=True,
     superseded_by=None,
     adapted_from=None,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2212.03533
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],  # dev?
+    },
 )
 
 e5_large = ModelMeta(
@@ -299,6 +391,14 @@
     use_instructions=True,
     superseded_by="intfloat/e5-large-v2",
     adapted_from=None,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2212.03533
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],  # dev?
+    },
 )
 
 e5_base = ModelMeta(
@@ -324,4 +424,12 @@
     use_instructions=True,
     superseded_by="intfloat/e5-base-v2",
     adapted_from=None,
+    public_training_data=False,  # couldn't find
+    public_training_code=False,  # couldn't find
+    training_datasets={
+        # source: https://arxiv.org/pdf/2212.03533
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],  # dev?
+    },
 )
diff --git a/mteb/models/google_models.py b/mteb/models/google_models.py
index 4fcd21ae6e..1b4a4a13ff 100644
--- a/mteb/models/google_models.py
+++ b/mteb/models/google_models.py
@@ -152,6 +152,9 @@ def encode(
     similarity_fn_name="cosine",  # assumed
     framework=["API"],
     use_instructions=True,
+    public_training_data=False,  # assumed
+    public_training_code=False,  # assumed
+    training_datasets=None,
 )
 
 google_text_emb_005 = ModelMeta(
@@ -173,6 +176,9 @@ def encode(
     similarity_fn_name="cosine",  # assumed
     framework=["API"],
     use_instructions=True,
+    public_training_data=False,  # assumed
+    public_training_code=False,  # assumed
+    training_datasets=None,
 )
 
 google_text_multilingual_emb_002 = ModelMeta(
@@ -194,4 +200,7 @@ def encode(
     similarity_fn_name="cosine",  # assumed
     framework=["API"],
     use_instructions=True,
+    public_training_data=False,  # assumed
+    public_training_code=False,  # assumed
+    training_datasets=None,
 )
diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py
index 1541d3ca3d..37da533457 100644
--- a/mteb/models/model2vec_models.py
+++ b/mteb/models/model2vec_models.py
@@ -27,7 +27,7 @@ def __init__(
             **kwargs: Additional arguments to pass to the wrapper.
         """
         try:
-            from model2vec import StaticModel
+            from model2vec import StaticModel  # type: ignore
         except ModuleNotFoundError as e:
             raise ModuleNotFoundError(
                 "To use the Model2Vec models `model2vec` is required. Please install it with `pip install mteb[model2vec]`."
@@ -63,7 +63,7 @@ def encode(
     open_weights=True,
     revision="5f4f5ca159b7321a8b39739bba0794fa0debddf4",
     release_date="2024-09-21",
-    n_parameters=103 * 1e6,
+    n_parameters=int(103 * 1e6),
     max_tokens=np.inf,  # Theoretically infinite
     embed_dim=256,
     license="mit",
@@ -72,6 +72,20 @@
     reference="https://huggingface.co/minishlab/M2V_base_glove_subword",
     use_instructions=False,
     adapted_from="BAAI/bge-base-en-v1.5",
+    public_training_data=True,
+    public_training_code=None,  # distilled model
+    training_datasets={  # same as the adapted-from model
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "AmazonReviewsClassification": [
+            "validation",
+            "test",
+        ],
+        "MLQARetrieval": [
+            "validation",
+            "test",
+        ],
+    },
     superseded_by=None,
 )
 
@@ -86,7 +100,7 @@ def encode(
     open_weights=True,
     revision="38ebd7f10f71e67fa8db898290f92b82e9cfff2b",
     release_date="2024-09-21",
-    n_parameters=102 * 1e6,
+    n_parameters=int(102 * 1e6),
     max_tokens=np.inf,
     embed_dim=256,
     license="mit",
@@ -96,6 +110,20 @@
     use_instructions=False,
     adapted_from="BAAI/bge-base-en-v1.5",
     superseded_by=None,
+    public_training_data=True,
+    public_training_code=None,  # distilled model
+    training_datasets={  # same as the adapted-from model
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "AmazonReviewsClassification": [
+            "validation",
+            "test",
+        ],
+        "MLQARetrieval": [
+            "validation",
+            "test",
+        ],
+    },
 )
 
 m2v_base_output = ModelMeta(
@@ -108,7 +136,7 @@ def encode(
     open_weights=True,
     revision="02460ae401a22b09d2c6652e23371398329551e2",
     release_date="2024-09-21",
-    n_parameters=7.56 * 1e6,
+    n_parameters=int(7.56 * 1e6),
     max_tokens=np.inf,
     embed_dim=256,
     license="mit",
@@ -118,6 +146,20 @@
     use_instructions=False,
     adapted_from="BAAI/bge-base-en-v1.5",
     superseded_by=None,
+    public_training_data=True,
+    public_training_code=None,  # distilled model
+    training_datasets={  # same as the adapted-from model
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "AmazonReviewsClassification": [
+            "validation",
+            "test",
+        ],
+        "MLQARetrieval": [
+            "validation",
+            "test",
+        ],
+    },
 )
 
 m2v_multilingual_output = ModelMeta(
@@ -130,7 +172,7 @@ def encode(
     open_weights=True,
     revision="2cf4ec4e1f51aeca6c55cf9b93097d00711a6305",
     release_date="2024-09-21",
-    n_parameters=128 * 1e6,
+    n_parameters=int(128 * 1e6),
     max_tokens=np.inf,
     embed_dim=256,
     license="mit",
@@ -140,6 +182,8 @@
     use_instructions=False,
     adapted_from="sentence-transformers/LaBSE",
     superseded_by=None,
+    public_training_data=True,
+    public_training_code=None,  # distilled model
 )
 
 potion_base_2m = ModelMeta(
@@ -162,6 +206,20 @@
     use_instructions=False,
     adapted_from="BAAI/bge-base-en-v1.5",
     superseded_by=None,
+    public_training_data=True,
+    public_training_code=None,  # distilled model
+    training_datasets={  # same as the adapted-from model
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "AmazonReviewsClassification": [
+            "validation",
+            "test",
+        ],
+        "MLQARetrieval": [
+            "validation",
+            "test",
+        ],
+    },
 )
 
 potion_base_4m = ModelMeta(
@@ -184,6 +242,20 @@
     use_instructions=False,
     adapted_from="BAAI/bge-base-en-v1.5",
     superseded_by=None,
+    public_training_data=True,
+    public_training_code=None,  # distilled model
+    training_datasets={  # same as the adapted-from model
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "AmazonReviewsClassification": [
+            "validation",
+            "test",
+        ],
+        "MLQARetrieval": [
+            "validation",
+            "test",
+        ],
+    },
 )
 
 potion_base_8m = ModelMeta(
@@ -206,4 +278,18 @@
     use_instructions=False,
     adapted_from="BAAI/bge-base-en-v1.5",
     superseded_by=None,
+    public_training_data=True,
+    public_training_code=None,  # distilled model
+    training_datasets={  # same as the adapted-from model
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "AmazonReviewsClassification": [
+            "validation",
+            "test",
+        ],
+        "MLQARetrieval": [
+            "validation",
+            "test",
+        ],
+    },
 )
diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py
index aecacf549a..c187bfa317 100644
--- a/mteb/models/openai_models.py
+++ b/mteb/models/openai_models.py
@@ -136,6 +136,9 @@ def _to_numpy(self, embedding_response) -> np.ndarray:
     similarity_fn_name="cosine",
     framework=["API"],
     use_instructions=False,
+    public_training_data=False,  # assumed
+    public_training_code=False,  # assumed
+    training_datasets=None,
 )
 text_embedding_3_large = ModelMeta(
     name="openai/text-embedding-3-large",
@@ -156,6 +159,9 @@ def _to_numpy(self, embedding_response) -> np.ndarray:
     use_instructions=False,
     n_parameters=None,
     memory_usage=None,
+    public_training_data=False,  # assumed
+    public_training_code=False,  # assumed
+    training_datasets=None,
 )
 text_embedding_ada_002 = ModelMeta(
     name="openai/text-embedding-ada-002",
@@ -176,4 +182,7 @@ def _to_numpy(self, embedding_response) -> np.ndarray:
     use_instructions=False,
     n_parameters=None,
     memory_usage=None,
+    public_training_data=False,  # assumed
+    public_training_code=False,  # assumed
+    training_datasets=None,
 )
diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py
index ea02508c36..28349d60d9 100644
--- a/mteb/models/sentence_transformers_models.py
+++ b/mteb/models/sentence_transformers_models.py
@@ -71,34 +71,45 @@
     embed_dim=384,
     license="apache-2.0",
     max_tokens=256,
-    reference="https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+    reference="https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2",
     similarity_fn_name="cosine",
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=False,
     superseded_by=None,
     adapted_from=None,
+    public_training_code=False,  # does sentence-transformers count?
+    public_training_data=True,
     training_datasets={
-        "s2orc": ["train"],
-        "flax-sentence-embeddings/stackexchange_xml": ["train"],
-        "ms_marco": ["train"],
-        "gooaq": ["train"],
-        "yahoo_answers_topics": ["train"],
-        "code_search_net": ["train"],
-        "search_qa": ["train"],
-        "eli5": ["train"],
-        "snli": ["train"],
-        "multi_nli": ["train"],
-        "wikihow": ["train"],
-        "natural_questions": ["train"],
-        "trivia_qa": ["train"],
-        "embedding-data/sentence-compression": ["train"],
-        "embedding-data/flickr30k-captions": ["train"],
-        "embedding-data/altlex": ["train"],
-        "embedding-data/simple-wiki": ["train"],
-        "embedding-data/QQP": ["train"],
-        "embedding-data/SPECTER": ["train"],
-        "embedding-data/PAQ_pairs": ["train"],
-        "embedding-data/WikiAnswers": ["train"],
+        # source: frontmatter in the README
+        # trained on stack exchange, unsure if sources match
+        "StackExchangeClusteringP2P": ["test"],
+        "StackExchangeClusteringP2P.v2": ["test"],
+        "StackExchangeClustering": ["test"],
+        "StackExchangeClustering.v2": ["test"],
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],
+        # Non-MTEB sources
+        # "s2orc": ["train"],
+        # "flax-sentence-embeddings/stackexchange_xml": ["train"],
+        # "ms_marco": ["train"],
+        # "gooaq": ["train"],
+        # "yahoo_answers_topics": ["train"],
+        # "code_search_net": ["train"],
+        # "search_qa": ["train"],
+        # "eli5": ["train"],
+        # "snli": ["train"],
+        # "multi_nli": ["train"],
+        # "wikihow": ["train"],
+        # "trivia_qa": ["train"],
+        # "embedding-data/sentence-compression": ["train"],
+        # "embedding-data/flickr30k-captions": ["train"],
+        # "embedding-data/altlex": ["train"],
+        # "embedding-data/simple-wiki": ["train"],
+        # "embedding-data/QQP": ["train"],
+        # "embedding-data/SPECTER": ["train"],
+        # "embedding-data/PAQ_pairs": ["train"],
+        # "embedding-data/WikiAnswers": ["train"],
     },
 )
 
@@ -195,28 +206,39 @@
     use_instructions=False,
     superseded_by=None,
     adapted_from=None,
+    public_training_code=False,  # does sentence-transformers count?
+    public_training_data=True,
     training_datasets={
-        "s2orc": ["train"],
-        "flax-sentence-embeddings/stackexchange_xml": ["train"],
-        "ms_marco": ["train"],
-        "gooaq": ["train"],
-        "yahoo_answers_topics": ["train"],
-        "code_search_net": ["train"],
-        "search_qa": ["train"],
-        "eli5": ["train"],
-        "snli": ["train"],
-        "multi_nli": ["train"],
-        "wikihow": ["train"],
-        "natural_questions": ["train"],
-        "trivia_qa": ["train"],
-        "embedding-data/sentence-compression": ["train"],
-        "embedding-data/flickr30k-captions": ["train"],
-        "embedding-data/altlex": ["train"],
-        "embedding-data/simple-wiki": ["train"],
-        "embedding-data/QQP": ["train"],
-        "embedding-data/SPECTER": ["train"],
-        "embedding-data/PAQ_pairs": ["train"],
-        "embedding-data/WikiAnswers": ["train"],
+        # source: frontmatter in the README
+        # trained on stack exchange, unsure if sources match
+        "StackExchangeClusteringP2P": ["test"],
+        "StackExchangeClusteringP2P.v2": ["test"],
+        "StackExchangeClustering": ["test"],
+        "StackExchangeClustering.v2": ["test"],
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],
+        # Non-MTEB sources
+        # "s2orc": ["train"],
+        # "flax-sentence-embeddings/stackexchange_xml": ["train"],
+        # "ms_marco": ["train"],
+        # "gooaq": ["train"],
+        # "yahoo_answers_topics": ["train"],
+        # "code_search_net": ["train"],
+        # "search_qa": ["train"],
+        # "eli5": ["train"],
+        # "snli": ["train"],
+        # "multi_nli": ["train"],
+        # "wikihow": ["train"],
+        # "trivia_qa": ["train"],
+        # "embedding-data/sentence-compression": ["train"],
+        # "embedding-data/flickr30k-captions": ["train"],
+        # "embedding-data/altlex": ["train"],
+        # "embedding-data/simple-wiki": ["train"],
+        # "embedding-data/QQP": ["train"],
+        # "embedding-data/SPECTER": ["train"],
+        # "embedding-data/PAQ_pairs": ["train"],
+        # "embedding-data/WikiAnswers": ["train"],
     },
 )
 
@@ -318,28 +340,39 @@
     use_instructions=False,
     superseded_by=None,
     adapted_from=None,
+    public_training_code=False,  # does sentence-transformers count?
+    public_training_data=True,
     training_datasets={
-        "s2orc": ["train"],
-        "flax-sentence-embeddings/stackexchange_xml": ["train"],
-        "ms_marco": ["train"],
-        "gooaq": ["train"],
-        "yahoo_answers_topics": ["train"],
-        "code_search_net": ["train"],
-        "search_qa": ["train"],
-        "eli5": ["train"],
-        "snli": ["train"],
-        "multi_nli": ["train"],
-        "wikihow": ["train"],
-        "natural_questions": ["train"],
-        "trivia_qa": ["train"],
-        "embedding-data/sentence-compression": ["train"],
-        "embedding-data/flickr30k-captions": ["train"],
-        "embedding-data/altlex": ["train"],
-        "embedding-data/simple-wiki": ["train"],
-        "embedding-data/QQP": ["train"],
-        "embedding-data/SPECTER": ["train"],
-        "embedding-data/PAQ_pairs": ["train"],
-        "embedding-data/WikiAnswers": ["train"],
+        # source: frontmatter in the README
+        # trained on stack exchange, unsure if sources match
+        "StackExchangeClusteringP2P": ["test"],
+        "StackExchangeClusteringP2P.v2": ["test"],
+        "StackExchangeClustering": ["test"],
+        "StackExchangeClustering.v2": ["test"],
+        "NQ": ["test"],
+        "NQHardNegatives": ["test"],
+        "MSMARCO": ["train"],
+        # Non-MTEB sources
+        # "s2orc": ["train"],
+        # "flax-sentence-embeddings/stackexchange_xml": ["train"],
+        # "ms_marco": ["train"],
+        # "gooaq": ["train"],
+        # "yahoo_answers_topics": ["train"],
+        # "code_search_net": ["train"],
+        # "search_qa": ["train"],
+        # "eli5": ["train"],
+        # "snli": ["train"],
+        # "multi_nli": ["train"],
+        # "wikihow": ["train"],
+        # "trivia_qa": ["train"],
+        # "embedding-data/sentence-compression": ["train"],
+        # "embedding-data/flickr30k-captions": ["train"],
+        # "embedding-data/altlex": ["train"],
+        # "embedding-data/simple-wiki": ["train"],
+        # "embedding-data/QQP": ["train"],
+        # "embedding-data/SPECTER": ["train"],
+        # "embedding-data/PAQ_pairs": ["train"],
+        # "embedding-data/WikiAnswers": ["train"],
     },
 )
 
@@ -361,9 +394,12 @@
     superseded_by=None,
     adapted_from=None,
     training_datasets={
-        "sentence-transformers/all-nli": ["train"],
-        "sentence-transformers/stsb": ["train"],
-        "sentence-transformers/quora-duplicates": ["train"],
-        "sentence-transformers/natural-questions": ["train"],
+        # source: yaml header
+        "NQ": ["test"],
+        # not in MTEB:
+        # "sentence-transformers/all-nli": ["train"],
+        # "sentence-transformers/stsb": ["train"],
+        # "sentence-transformers/quora-duplicates": ["train"],
+        # "sentence-transformers/natural-questions": ["train"],
     },
 )
diff --git a/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py b/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py
index d6bb252304..c411138e9f 100644
--- a/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py
+++ b/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py
@@ -92,7 +92,6 @@ class StackExchangeClusteringP2P(AbsTaskClustering):
         eval_langs=["eng-Latn"],
         main_score="v_measure",
         date=None,
-        form=None,
         domains=None,
         task_subtypes=None,
         license=None,
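
Note (not part of the patch): the fields added above are plain `ModelMeta` attributes, so once the patch is applied they can be read back programmatically. A minimal sketch, assuming `mteb.get_model_meta` is available as in current mteb releases; the chosen model name is just an example:

```python
import mteb

# Look up the metadata object that this patch annotates.
meta = mteb.get_model_meta("intfloat/multilingual-e5-small")

# The three fields introduced by this PR. Values reflect the annotations
# above; False means "couldn't find", None means "unknown/not disclosed".
print(meta.public_training_code)  # False
print(meta.public_training_data)  # False
print(meta.training_datasets)     # e.g. {"NQ": ["test"], "NQHardNegatives": ["test"], "MSMARCO": ["train"]}
```

This is the intended use of the annotations: tooling can flag MTEB tasks (and splits) that overlap with a model's training data when reporting benchmark results.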