From a1d628e4771fe189f3c44681c496e18540b161cc Mon Sep 17 00:00:00 2001 From: Naoaki Okazaki Date: Sun, 30 Jun 2024 00:44:02 +0900 Subject: [PATCH] Updated with the latest experimental results. --- _data/models.yml | 16 ++----- assets/data.js | 117 ++++++++++++++++------------------------------- 2 files changed, 44 insertions(+), 89 deletions(-) diff --git a/_data/models.yml b/_data/models.yml index e0a3203..01f358d 100644 --- a/_data/models.yml +++ b/_data/models.yml @@ -64,7 +64,7 @@ name: KARAKURI LM 70B Chat v0.1 size: 70 type: instruct -- basename: Swallow-MX-8x7b-NVE-v0.1 +- basename: Swallow-MX 8x7B v0.1 missing: Japanese tasks, Japanese MT-bench tasks, English tasks model: karakuri-ai/karakuri-lm-8x7b-instruct-v0.1 name: KARAKURI LM 8x7B Instruct v0.1 @@ -179,8 +179,7 @@ size: 47 type: instruct - basename: '' - missing: JCom, JEMHopQA, NIILC, JSQuAD, Ja Avg, OpenBookQA, TriviaQA, HellaSwag, - SQuAD2, XWINO, GSM8K, BBH, HumanEval, En Avg + missing: BBH, HumanEval model: mistralai/Mixtral-8x7B-v0.1 name: Mixtral-8x7B-v0.1 size: 47 @@ -234,7 +233,8 @@ size: 7 type: base - basename: '' - missing: Japanese tasks, English tasks + missing: JCom, JEMHopQA, NIILC, JSQuAD, MGSM, WMT20 (en-ja), WMT20 (ja-en), Ja Avg, + OpenBookQA, TriviaQA, HellaSwag, SQuAD2, XWINO, MMLU, GSM8K, BBH, En Avg model: sbintuitions/sarashina2-13b name: Sarashina2-13B size: 13 @@ -278,15 +278,9 @@ - basename: '' missing: '' model: tokyotech-llm/Swallow-MX-8x7b-NVE-v0.1 - name: Swallow-MX v0.1 + name: Swallow-MX 8x7B v0.1 size: 47 type: base -- basename: instruct・chatモデルではない - missing: Japanese MT-bench tasks - model: tokyotech-llm/Swallow-MX-8x7b-NVE-v0.1 - name: Swallow-MX-8x7b-NVE-v0.1 - size: 47 - type: instruct - basename: '' missing: '' model: 01-ai/Yi-1.5-6B diff --git a/assets/data.js b/assets/data.js index 72fb6a2..6ca38b3 100644 --- a/assets/data.js +++ b/assets/data.js @@ -432,7 +432,7 @@ const dataSet = [ "Type": "inst", "Model": "karakuri-ai/karakuri-lm-8x7b-instruct-v0.1", "Name": "KARAKURI LM 8x7B Instruct v0.1", - "Base name": "Swallow-MX-8x7b-NVE-v0.1", + "Base name": "Swallow-MX 8x7B v0.1", "Size (B)": 47, "Coding": NaN, "Extraction": NaN, @@ -1184,29 +1184,29 @@ const dataSet = [ "Stem": NaN, "Writing": NaN, "Ja MT-Bench": NaN, - "JCom": -1.0, - "JEMHopQA": -1.0, - "NIILC": -1.0, - "JSQuAD": -1.0, + "JCom": 0.8436, + "JEMHopQA": 0.5038, + "NIILC": 0.3745, + "JSQuAD": 0.8952, "XL-Sum": 0.2205, "MGSM": 0.328, "WMT20 (en-ja)": 0.1948, "WMT20 (ja-en)": 0.2024, - "Ja Avg": -0.5054, - "JMMLU": -1.0, + "Ja Avg": 0.3079, + "JMMLU": 0.5162, "JHumanEval": -1.0, - "OpenBookQA": -1.0, - "TriviaQA": -1.0, - "HellaSwag": -1.0, - "SQuAD2": -1.0, - "XWINO": -1.0, + "OpenBookQA": 0.388, + "TriviaQA": 0.7938, + "HellaSwag": 0.664, + "SQuAD2": 0.3612, + "XWINO": 0.9217, "MMLU": 0.7038, - "GSM8K": -1.0, + "GSM8K": 0.5823, "BBH": -1.0, "HumanEval": -1.0, - "En Avg": -0.8107, + "En Avg": 0.2683, "SortKey": "Mixtral--v0.1047", - "Missing": "JCom, JEMHopQA, NIILC, JSQuAD, Ja Avg, OpenBookQA, TriviaQA, HellaSwag, SQuAD2, XWINO, GSM8K, BBH, HumanEval, En Avg" + "Missing": "BBH, HumanEval" }, { "Type": "base", @@ -1535,29 +1535,29 @@ const dataSet = [ "Stem": NaN, "Writing": NaN, "Ja MT-Bench": NaN, - "JCom": NaN, - "JEMHopQA": NaN, - "NIILC": NaN, - "JSQuAD": NaN, - "XL-Sum": NaN, - "MGSM": NaN, - "WMT20 (en-ja)": NaN, - "WMT20 (ja-en)": NaN, - "Ja Avg": NaN, - "JMMLU": NaN, - "JHumanEval": NaN, - "OpenBookQA": NaN, - "TriviaQA": NaN, - "HellaSwag": NaN, - "SQuAD2": NaN, - "XWINO": NaN, - "MMLU": NaN, - "GSM8K": NaN, - "BBH": NaN, - "HumanEval": NaN, - "En Avg": NaN, + "JCom": -1.0, + "JEMHopQA": -1.0, + "NIILC": -1.0, + "JSQuAD": -1.0, + "XL-Sum": 0.1588, + "MGSM": -1.0, + "WMT20 (en-ja)": -1.0, + "WMT20 (ja-en)": -1.0, + "Ja Avg": -0.7841, + "JMMLU": -1.0, + "JHumanEval": 0.0, + "OpenBookQA": -1.0, + "TriviaQA": -1.0, + "HellaSwag": -1.0, + "SQuAD2": -1.0, + "XWINO": -1.0, + "MMLU": -1.0, + "GSM8K": -1.0, + "BBH": -1.0, + "HumanEval": 0.0, + "En Avg": -0.8889, "SortKey": "Sarashina2-013", - "Missing": "Japanese tasks, English tasks" + "Missing": "JCom, JEMHopQA, NIILC, JSQuAD, MGSM, WMT20 (en-ja), WMT20 (ja-en), Ja Avg, OpenBookQA, TriviaQA, HellaSwag, SQuAD2, XWINO, MMLU, GSM8K, BBH, En Avg" }, { "Type": "base", @@ -1796,7 +1796,7 @@ const dataSet = [ { "Type": "base", "Model": "tokyotech-llm/Swallow-MX-8x7b-NVE-v0.1", - "Name": "Swallow-MX v0.1", + "Name": "Swallow-MX 8x7B v0.1", "Base name": "", "Size (B)": 47, "Coding": NaN, @@ -1829,48 +1829,9 @@ const dataSet = [ "BBH": 0.6822, "HumanEval": 0.4, "En Avg": 0.5977, - "SortKey": "Swallow-MX v0.1047", + "SortKey": "Swallow-MX v0.1047", "Missing": "" }, - { - "Type": "inst", - "Model": "tokyotech-llm/Swallow-MX-8x7b-NVE-v0.1", - "Name": "Swallow-MX-8x7b-NVE-v0.1", - "Base name": "instruct・chatモデルではない", - "Size (B)": 47, - "Coding": -1.0, - "Extraction": -1.0, - "Humanities": -1.0, - "Math": -1.0, - "Reasoning": -1.0, - "Roleplay": -1.0, - "Stem": -1.0, - "Writing": -1.0, - "Ja MT-Bench": -1.0, - "JCom": 0.9223, - "JEMHopQA": 0.5519, - "NIILC": 0.5779, - "JSQuAD": 0.9146, - "XL-Sum": 0.262, - "MGSM": 0.42, - "WMT20 (en-ja)": 0.2718, - "WMT20 (ja-en)": 0.209, - "Ja Avg": 0.5048, - "JMMLU": 0.5623, - "JHumanEval": 0.3567, - "OpenBookQA": 0.342, - "TriviaQA": 0.7734, - "HellaSwag": 0.6505, - "SQuAD2": 0.3479, - "XWINO": 0.9166, - "MMLU": 0.6941, - "GSM8K": 0.5724, - "BBH": 0.6822, - "HumanEval": 0.4, - "En Avg": 0.5977, - "SortKey": "Swallow-MX--NVE-v0.1047", - "Missing": "Japanese MT-bench tasks" - }, { "Type": "base", "Model": "01-ai/Yi-1.5-6B",