diff --git a/lm_eval/tasks/benczechmark/utils.py b/lm_eval/tasks/benczechmark/utils.py
index 602a9a454e..ae9078bcc4 100644
--- a/lm_eval/tasks/benczechmark/utils.py
+++ b/lm_eval/tasks/benczechmark/utils.py
@@ -286,15 +286,18 @@ def aggregation(self) -> dict:
             "acc": mean,
         }
 
+# MMLU multi-choice style (A....Z)
 ANSWER_LETTERS = [chr(ord('A') + i) for i in range(ord('Z') - ord('A') + 1)]
 
+
 def mmlu_get_choice(dataset):
     choice = [c for c in ANSWER_LETTERS if c in dataset.keys()]
     if len(choice) == 0:
         raise ValueError(f"No answer columns found in dataset")
     return choice
 
+
 def mmlu_get_answer_index(dataset):
     return ANSWER_LETTERS.index(dataset["correct_answer"])
 
 
@@ -308,12 +311,17 @@ def cermat_get_choice(dataset):
         raise ValueError(f"Invalid number of choices: {len(dataset['choices'])}")
 
 
+def history_ir_get_choice(dataset):
+    return ["A", "B", "C", "D"]
+
+
 def mmlu_get_question_text(dataset):
     dataset_answer_keys = mmlu_get_choice(dataset)
     question_text = dataset['question'].strip()
     choices_text = "\n".join(f"{c}. {dataset[c]}" for c in dataset_answer_keys)
     return f"{question_text}\n{choices_text}\nOdpověď: "
 
+
 def mmlu_get_question_text_umimeto(dataset):
     """
     Umimeto has really bad questions, which require the topic at minimum to make sense
@@ -322,4 +330,4 @@ def mmlu_get_question_text_umimeto(dataset):
     question_text = dataset['question'].strip()
     choices_text = "\n".join(f"{c}. {dataset[c]}" for c in dataset_answer_keys)
     topic = dataset["topic"]
-    return f"{topic}: {question_text}\n{choices_text}\nOdpověď: "
\ No newline at end of file
+    return f"{topic}: {question_text}\n{choices_text}\nOdpověď: "
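For reviewers, a minimal usage sketch of the prompt-building helpers touched by this diff. The sample row below is hypothetical and only assumes the MMLU-style column layout these helpers expect (one column per answer letter, plus `question` and `correct_answer`); it is not taken from any BenCzechMark dataset.

```python
# Hypothetical example row, for illustration only.
from lm_eval.tasks.benczechmark.utils import (
    mmlu_get_answer_index,
    mmlu_get_choice,
    mmlu_get_question_text,
)

row = {
    "question": "Kolik je 2 + 2?",  # Czech: "How much is 2 + 2?"
    "A": "3",
    "B": "4",
    "C": "5",
    "correct_answer": "B",
}

print(mmlu_get_choice(row))        # ['A', 'B', 'C'], only letters present as columns
print(mmlu_get_answer_index(row))  # 1, the index of 'B' in ANSWER_LETTERS
print(mmlu_get_question_text(row))
# Kolik je 2 + 2?
# A. 3
# B. 4
# C. 5
# Odpověď: 
```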