sbintuitions · lsz05 · Aug 20, 2024 · Jul 1, 2024 · Jul 1, 2024 · Jul 1, 2024
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -21,7 +21,7 @@ jobs:
   run-tests:
     runs-on: ubuntu-latest
     env:
-      PYTHON_VERSION: "3.9"
+      PYTHON_VERSION: "3.10"
       NO_CACHE: ${{ github.event.inputs.no-cache || 'false' }}
     steps:
       - name: Checkout
@@ -53,7 +53,7 @@ jobs:
   lint_check:
     runs-on: ubuntu-latest
     env:
-      PYTHON_VERSION: "3.9"
+      PYTHON_VERSION: "3.10"
     steps:
       - uses: actions/checkout@v3
 

diff --git a/.markdownlint.yaml b/.markdownlint.yaml
@@ -1,3 +1,4 @@
 MD013: false
 MD040: false
-MD025: false
+MD025: false
+MD028: false
diff --git a/README.md b/README.md
@@ -4,6 +4,8 @@
 
 This is an easy-to-use evaluation script designed for JMTEB evaluation.
 
+The JMTEB leaderboard is [here](leaderboard.md). Submission guidelines are coming soon.
+
 ## Quick start
 
 ```bash
@@ -38,4 +40,40 @@ poetry run python -m jmteb \
 ```
 
 > [!NOTE]
-> Some tasks (e.g., AmazonReviewClassification in classification, JAQKET and Mr.TyDi-ja in retrieval, esci in reranking) are time-consuming and memory-consuming. Heavy retrieval tasks take hours to encode the large corpus, and use much memory for the storage of such vectors. If you want to exclude them, add `--eval_exclude "['amazon_review_classification', 'mrtydi', 'jaqket', 'esci']"`.
+> Some tasks (e.g., AmazonReviewClassification in classification, JAQKET and Mr.TyDi-ja in retrieval, esci in reranking) are time-consuming and memory-consuming. Heavy retrieval tasks take hours to encode the large corpus, and use much memory for the storage of such vectors. If you want to exclude them, add `--eval_exclude "['amazon_review_classification', 'mrtydi', 'jaqket', 'esci']"`. Similarly, you can use `--eval_include` to run only the evaluation datasets you want.
+
+> [!NOTE]
+> If you want to log model predictions to further analyze the performance of your model, use `--log_predictions true` to make all evaluators log predictions. You can also set whether to log predictions in the config of each evaluator.
+
+## Multi-GPU support
+
+There are two ways to enable multi-GPU evaluation.
+
+* Use the `DPSentenceBertEmbedder` class ([here](src/jmteb/embedders/data_parallel_sbert_embedder.py)). For example,
+
+```bash
+poetry run python -m jmteb \
+  --evaluators "src/jmteb/configs/tasks/jsts.jsonnet" \
+  --embedder DPSentenceBertEmbedder \
+  --embedder.model_name_or_path "<model_name_or_path>" \
+  --save_dir "output/<model_name_or_path>"
+```
+
+* With `torchrun`, multi-GPU evaluation is available for [`TransformersEmbedder`](src/jmteb/embedders/transformers_embedder.py). For example,
+
+```bash
+MODEL_NAME=<model_name_or_path>
+MODEL_KWARGS="\{\'torch_dtype\':\'torch.bfloat16\'\}"
+torchrun \
+    --nproc_per_node=$GPUS_PER_NODE --nnodes=1 \
+    src/jmteb/__main__.py --embedder TransformersEmbedder \
+    --embedder.model_name_or_path ${MODEL_NAME} \
+    --embedder.pooling_mode cls \
+    --embedder.batch_size 4096 \
+    --embedder.model_kwargs ${MODEL_KWARGS} \
+    --embedder.max_seq_length 512 \
+    --save_dir "output/${MODEL_NAME}" \
+    --evaluators src/jmteb/configs/jmteb.jsonnet
+```
+
+Note that the batch size here is the global batch size (`per_device_batch_size` × `n_gpu`).
diff --git a/docs/results/MU-Kindai/Japanese-DiffCSE-BERT-base/summary.json b/docs/results/MU-Kindai/Japanese-DiffCSE-BERT-base/summary.json
@@ -0,0 +1,62 @@
+{
+    "Classification": {
+        "amazon_counterfactual_classification": {
+            "macro_f1": 0.7809527709426081
+        },
+        "amazon_review_classification": {
+            "macro_f1": 0.5155899232320224
+        },
+        "massive_intent_classification": {
+            "macro_f1": 0.7879373479249787
+        },
+        "massive_scenario_classification": {
+            "macro_f1": 0.8662625888023707
+        }
+    },
+    "Reranking": {
+        "esci": {
+            "ndcg@10": 0.9095168116460639
+        }
+    },
+    "Retrieval": {
+        "jagovfaqs_22k": {
+            "ndcg@10": 0.42314124780036416
+        },
+        "jaqket": {
+            "ndcg@10": 0.36199154051747723
+        },
+        "mrtydi": {
+            "ndcg@10": 0.07810683176415421
+        },
+        "nlp_journal_abs_intro": {
+            "ndcg@10": 0.6077212544951452
+        },
+        "nlp_journal_title_abs": {
+            "ndcg@10": 0.6433890489201118
+        },
+        "nlp_journal_title_intro": {
+            "ndcg@10": 0.39317174536190913
+        }
+    },
+    "STS": {
+        "jsick": {
+            "spearman": 0.754165277432144
+        },
+        "jsts": {
+            "spearman": 0.7558202366183716
+        }
+    },
+    "Clustering": {
+        "livedoor_news": {
+            "v_measure_score": 0.4966545453348478
+        },
+        "mewsc16": {
+            "v_measure_score": 0.3877356318022785
+        }
+    },
+    "PairClassification": {
+        "paws_x_ja": {
+            "binary_f1": 0.6237623762376237
+        }
+    }
+}
diff --git a/docs/results/MU-Kindai/Japanese-MixCSE-BERT-base/summary.json b/docs/results/MU-Kindai/Japanese-MixCSE-BERT-base/summary.json
@@ -0,0 +1,62 @@
+{
+    "Classification": {
+        "amazon_counterfactual_classification": {
+            "macro_f1": 0.776174162517931
+        },
+        "amazon_review_classification": {
+            "macro_f1": 0.5085781180553806
+        },
+        "massive_intent_classification": {
+            "macro_f1": 0.7718541530739129
+        },
+        "massive_scenario_classification": {
+            "macro_f1": 0.8592571786794985
+        }
+    },
+    "Reranking": {
+        "esci": {
+            "ndcg@10": 0.9100551950168166
+        }
+    },
+    "Retrieval": {
+        "jagovfaqs_22k": {
+            "ndcg@10": 0.42368135774043536
+        },
+        "jaqket": {
+            "ndcg@10": 0.37721850397542034
+        },
+        "mrtydi": {
+            "ndcg@10": 0.07878085186566607
+        },
+        "nlp_journal_abs_intro": {
+            "ndcg@10": 0.636999375405723
+        },
+        "nlp_journal_title_abs": {
+            "ndcg@10": 0.6413498649875696
+        },
+        "nlp_journal_title_intro": {
+            "ndcg@10": 0.397250919496823
+        }
+    },
+    "STS": {
+        "jsick": {
+            "spearman": 0.7756925231422259
+        },
+        "jsts": {
+            "spearman": 0.7652968548841591
+        }
+    },
+    "Clustering": {
+        "livedoor_news": {
+            "v_measure_score": 0.5262387436934941
+        },
+        "mewsc16": {
+            "v_measure_score": 0.37277574537292835
+        }
+    },
+    "PairClassification": {
+        "paws_x_ja": {
+            "binary_f1": 0.623321554770318
+        }
+    }
+}
diff --git a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-sup/summary.json b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-sup/summary.json
@@ -0,0 +1,62 @@
+{
+    "Classification": {
+        "amazon_counterfactual_classification": {
+            "macro_f1": 0.7619809437515043
+        },
+        "amazon_review_classification": {
+            "macro_f1": 0.5205592432502059
+        },
+        "massive_intent_classification": {
+            "macro_f1": 0.7789367871593064
+        },
+        "massive_scenario_classification": {
+            "macro_f1": 0.8490320705866646
+        }
+    },
+    "Reranking": {
+        "esci": {
+            "ndcg@10": 0.9065584234991577
+        }
+    },
+    "Retrieval": {
+        "jagovfaqs_22k": {
+            "ndcg@10": 0.4411487123884245
+        },
+        "jaqket": {
+            "ndcg@10": 0.39613283459361814
+        },
+        "mrtydi": {
+            "ndcg@10": 0.08154879873415645
+        },
+        "nlp_journal_abs_intro": {
+            "ndcg@10": 0.6276035246534508
+        },
+        "nlp_journal_title_abs": {
+            "ndcg@10": 0.5838785018803183
+        },
+        "nlp_journal_title_intro": {
+            "ndcg@10": 0.3489329387182086
+        }
+    },
+    "STS": {
+        "jsick": {
+            "spearman": 0.7463567093877269
+        },
+        "jsts": {
+            "spearman": 0.7468283806971927
+        }
+    },
+    "Clustering": {
+        "livedoor_news": {
+            "v_measure_score": 0.41041888940251137
+        },
+        "mewsc16": {
+            "v_measure_score": 0.45175891401665724
+        }
+    },
+    "PairClassification": {
+        "paws_x_ja": {
+            "binary_f1": 0.6236711552090717
+        }
+    }
+}
diff --git a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-unsup/summary.json b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-unsup/summary.json
@@ -0,0 +1,62 @@
+{
+    "Classification": {
+        "amazon_counterfactual_classification": {
+            "macro_f1": 0.7619809437515043
+        },
+        "amazon_review_classification": {
+            "macro_f1": 0.5152108946679324
+        },
+        "massive_intent_classification": {
+            "macro_f1": 0.7895128475562229
+        },
+        "massive_scenario_classification": {
+            "macro_f1": 0.865430249169577
+        }
+    },
+    "Reranking": {
+        "esci": {
+            "ndcg@10": 0.9115815294581953
+        }
+    },
+    "Retrieval": {
+        "jagovfaqs_22k": {
+            "ndcg@10": 0.47387768939865055
+        },
+        "jaqket": {
+            "ndcg@10": 0.3956683977353904
+        },
+        "mrtydi": {
+            "ndcg@10": 0.1144234568266308
+        },
+        "nlp_journal_abs_intro": {
+            "ndcg@10": 0.6416096544574569
+        },
+        "nlp_journal_title_abs": {
+            "ndcg@10": 0.7023477497744102
+        },
+        "nlp_journal_title_intro": {
+            "ndcg@10": 0.4536720868647063
+        }
+    },
+    "STS": {
+        "jsick": {
+            "spearman": 0.781770693640686
+        },
+        "jsts": {
+            "spearman": 0.7680617109850311
+        }
+    },
+    "Clustering": {
+        "livedoor_news": {
+            "v_measure_score": 0.5301620892693397
+        },
+        "mewsc16": {
+            "v_measure_score": 0.4034776723308173
+        }
+    },
+    "PairClassification": {
+        "paws_x_ja": {
+            "binary_f1": 0.6238078417520311
+        }
+    }
+}
diff --git a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-sup/summary.json b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-sup/summary.json
@@ -0,0 +1,62 @@
+{
+    "Classification": {
+        "amazon_counterfactual_classification": {
+            "macro_f1": 0.7725250131648236
+        },
+        "amazon_review_classification": {
+            "macro_f1": 0.5341627023771393
+        },
+        "massive_intent_classification": {
+            "macro_f1": 0.7682863192709365
+        },
+        "massive_scenario_classification": {
+            "macro_f1": 0.8639396658321546
+        }
+    },
+    "Reranking": {
+        "esci": {
+            "ndcg@10": 0.9094717381883379
+        }
+    },
+    "Retrieval": {
+        "jagovfaqs_22k": {
+            "ndcg@10": 0.47038430326303626
+        },
+        "jaqket": {
+            "ndcg@10": 0.44101304795602897
+        },
+        "mrtydi": {
+            "ndcg@10": 0.11429128335865787
+        },
+        "nlp_journal_abs_intro": {
+            "ndcg@10": 0.43434267808785576
+        },
+        "nlp_journal_title_abs": {
+            "ndcg@10": 0.6240651697600803
+        },
+        "nlp_journal_title_intro": {
+            "ndcg@10": 0.3651687833824759
+        }
+    },
+    "STS": {
+        "jsick": {
+            "spearman": 0.787528927058734
+        },
+        "jsts": {
+            "spearman": 0.7781413957931619
+        }
+    },
+    "Clustering": {
+        "livedoor_news": {
+            "v_measure_score": 0.48448646364489634
+        },
+        "mewsc16": {
+            "v_measure_score": 0.43168522818790694
+        }
+    },
+    "PairClassification": {
+        "paws_x_ja": {
+            "binary_f1": 0.6235418875927891
+        }
+    }
+}