Merge branch 'main' into integrate_multipl-e
loubnabnl authored Apr 22, 2023
2 parents e269620 + 8b28d3a commit 26c38c0
Showing 16 changed files with 757 additions and 43 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -89,7 +89,7 @@ accelerate launch main.py \
* `limit` represents the number of problems to solve; if it's not provided, all problems in the benchmark are selected.
* `allow_code_execution` enables execution of the generated code: it is off by default, so read the displayed warning before passing it to enable execution.
* Some models with custom code on the HF hub, like [SantaCoder](https://huggingface.co/bigcode/santacoder), require passing `--trust_remote_code`; for private models add `--use_auth_token`.
* `save_generations` saves the post-processed generations in a json file. You can also save references by calling `--save_references`
* `save_generations` saves the post-processed generations in a json file at `save_generations_path` (by default `generations.json`). You can also save references by calling `--save_references`

Some tasks don't require code execution, such as
`codexglue_code_to_text-<LANGUAGE>`/`codexglue_code_to_text-python-left`/`conala`/`concode`, which use BLEU evaluation. In addition, we generate one candidate solution for each problem in these tasks, so use `n_samples=1` and `batch_size=1`. (Note that `batch_size` should always be less than or equal to `n_samples`.)
@@ -108,7 +108,7 @@ If you already have the generations in a json file from this evaluation harness
Below is an example; be mindful to specify arguments appropriate to the task you are evaluating, and note that the `model` value here only serves to document the experiment.

```bash
accelerate launch main.py --tasks mbpp --allow_code_execution --generations_path generations.json --model incoder-temperature-08
accelerate launch main.py --tasks mbpp --allow_code_execution --load_generations_path generations.json --model incoder-temperature-08
```
## Docker containers
For safety, we provide Dockerfiles to run the execution inside a Docker container. To do that, first run the generation on your machine and save the generations in generations.json by adding the flag --generation_only to the command. Then build the Docker container and run the evaluation inside it.
49 changes: 49 additions & 0 deletions docs/README.md
@@ -218,6 +218,55 @@ These are classification tasks for Java and C, we provide the code to finetune m
* [Java code equivalence prediction](https://huggingface.co/datasets/code_x_glue_cc_clone_detection_big_clone_bench)
* [C code defect prediction](https://huggingface.co/datasets/code_x_glue_cc_defect_detection)

## Natural language reasoning tasks

These are reasoning tasks involving mathematical, symbolic, and procedural reasoning, where the task descriptions/questions are given in natural language.

#### PAL - Program-aided Language Models

In PAL, large language models solve reasoning problems by generating reasoning chains with code. The currently supported PAL datasets are:

* [GSM8K](https://huggingface.co/datasets/gsm8k) - Grade School Math 8K
* [GSM-HARD](https://huggingface.co/datasets/reasoning-machines/gsm-hard) - Created by replacing the numbers in the questions of GSM8K with larger numbers

The model is prompted with few-shot examples of questions and reasoning steps as code. It then generates reasoning steps for a new question as Python code, which is executed to get the model's predicted answer.
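To make this concrete, here is a hypothetical sketch (not taken from the harness) of what a PAL-style generation and its execution could look like; the question and the `solution()` program below are illustrative only:

```python
# Hypothetical PAL-style completion for a GSM8K-style question:
# "Olivia has $23. She buys five bagels for $3 each. How much money does she have left?"
generated_code = """
def solution():
    money_initial = 23
    bagels = 5
    bagel_cost = 3
    money_spent = bagels * bagel_cost
    money_left = money_initial - money_spent
    return money_left

print(solution())
"""

import subprocess
import sys

# Execute the generated program; its stdout is taken as the predicted answer.
result = subprocess.run(
    [sys.executable, "-c", generated_code],
    capture_output=True, text=True, timeout=10,
)
predicted_answer = result.stdout.strip()  # "8"
```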

PAL uses two types of few-shot evaluation:

- `greedy` - samples one generation with greedy decoding and evaluates it against the reference answers
- `majority_voting` - samples k generations (k=40 in the paper) and evaluates the majority-voted answer against the reference (see the sketch below).
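As a minimal illustration of the `majority_voting` mode (a simplified sketch, not the harness implementation), the k executed answers are tallied, failed executions are discarded, and the most frequent remaining answer is compared against the reference:

```python
from collections import Counter

# Hypothetical answers extracted from k=5 executed generations for one problem
executed_answers = ["8", "8", "7", "8", "failed: NameError"]

# Drop failed executions, then keep the most common remaining answer
valid = [a for a in executed_answers if not a.startswith("failed:")]
majority_answer = Counter(valid).most_common(1)[0][0]  # "8"

reference = "8"
score = 1 if abs(float(majority_answer) - float(reference)) < 1e-3 else 0  # 1
```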

**Task signature**: `pal-{dataset_name}-{evaluation_type}` (e.g. `pal-gsm8k-greedy`, `pal-gsmhard-majority_voting`)

Commands to run the evaluation:

**Greedy Decoding**

```bash
accelerate launch main.py \
--model <MODEL_NAME> \
--max_length_generation <MAX_LENGTH> \
--tasks pal-gsm8k-greedy \
--n_samples 1 \
--batch_size 1 \
--do_sample False \
--allow_code_execution
```

**Majority Voting**

```bash
accelerate launch main.py \
--model <MODEL_NAME> \
--max_length_generation <MAX_LENGTH> \
--tasks pal-gsmhard-majority_voting \
--n_samples 40 \
--batch_size 1 \
--temperature 0.7 \
--top_p 0.95 \
--allow_code_execution
```

## How to add a new benchmark

We welcome contributions to add new code benchmarks to this evaluation harness. You can find a step-by-step guide in [`guide.md`](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/docs/guide.md).
20 changes: 10 additions & 10 deletions lm_eval/evaluator.py
@@ -32,7 +32,7 @@ def __init__(self, accelerator, model, tokenizer, args):
self.args = args

# setup arguments
self.output_path = args.output_path
self.metric_output_path = args.metric_output_path

# code evaluation permission
self.allow_code_execution = args.allow_code_execution
@@ -52,12 +52,11 @@ def generate_text(self, task_name):
args=self.args,
)
references = [task.get_reference(dataset[i]) for i in range(n_tasks)]
if len(generations[0]) != self.args.n_samples and not self.args.generations_path:
if len(generations[0]) > self.args.n_samples:
generations = [l[: self.args.n_samples] for l in generations]
if self.accelerator.is_main_process:
warnings.warn(
"Number of tasks wasn't proportional to number of devices, we removed extra predictions"
)
warnings.warn(
f"Number of tasks wasn't proportional to number of devices, we removed extra predictions to only keep nsamples={self.args.n_samples}"
)
return generations, references

def evaluate(self, task_name):
@@ -68,19 +67,20 @@ def evaluate(self, task_name):
generations, references = self.generate_text(task_name)

if self.accelerator.is_main_process:
if not self.args.generations_path:
if not self.args.load_generations_path:
if self.args.save_generations:
with open("generations.json", "w") as fp:
with open(self.args.save_generations_path, "w") as fp:
json.dump(generations, fp)
print("generations were saved")
print(f"generations were saved at {self.args.save_generations_path}")
if self.args.save_references:
with open("references.json", "w") as fp:
json.dump(references, fp)
print("references were saved")
print("references were saved at references.json")

# make sure tokenizer plays nice with multiprocessing
os.environ["TOKENIZERS_PARALLELISM"] = "false"
if self.allow_code_execution and task.requires_execution:
os.environ["HF_ALLOW_CODE_EVAL"] = "1"
print("Evaluating generations...")
results = task.process_results(generations, references)
return results
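For reference, the file written when `save_generations` is set (and read back through `--load_generations_path`) is a JSON list with one inner list of candidate solutions per problem; a minimal sketch of inspecting it, assuming the default `generations.json` path:

```python
import json

# generations.json: [[candidate_1, ..., candidate_n_samples], ...], one inner list per problem
with open("generations.json") as fp:
    generations = json.load(fp)

print(f"{len(generations)} problems, {len(generations[0])} candidate solutions each")
print(generations[0][0])  # first candidate solution for the first problem
```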
16 changes: 10 additions & 6 deletions lm_eval/generation.py
@@ -1,9 +1,10 @@
from tqdm import tqdm
import json
from math import ceil

from accelerate.utils import set_seed
from torch.utils.data.dataloader import DataLoader
from tqdm import tqdm
from transformers import StoppingCriteria, StoppingCriteriaList
from accelerate.utils import set_seed

from lm_eval.utils import TokenizedDataset, complete_code

@@ -35,9 +36,9 @@ def __call__(self, input_ids, scores, **kwargs):


def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks, args):
if args.generations_path:
if args.load_generations_path:
# load generated code
with open(args.generations_path) as fp:
with open(args.load_generations_path) as fp:
generations = json.load(fp)
if accelerator.is_main_process:
print(
@@ -56,13 +57,15 @@ def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks,
"max_length": args.max_length_generation,
}
if task.stop_words:
if tokenizer.eos_token:
task.stop_words.append(tokenizer.eos_token)
gen_kwargs["stopping_criteria"] = StoppingCriteriaList(
[EndOfFunctionCriteria(0, task.stop_words, tokenizer)]
)

if accelerator.is_main_process:
print(f"number of problems for this task is {n_tasks}")
n_copies = args.n_samples // args.batch_size
n_copies = ceil(args.n_samples / args.batch_size)

ds_tokenized = TokenizedDataset(
task,
@@ -77,8 +80,9 @@ def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks,

# note: args.batch_size is actually num_return_sequences (samples per problem), not the DataLoader batch size
ds_loader = DataLoader(ds_tokenized, batch_size=1)
model = model.to(accelerator.device)
ds_loader = accelerator.prepare(ds_loader)

model, ds_loader = accelerator.prepare(model, ds_loader)
generations = complete_code(
task,
accelerator,
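The switch from integer division to `ceil` when computing `n_copies` matters whenever `n_samples` is not a multiple of `batch_size`: with floor division the harness could generate too few samples (or none at all when `batch_size > n_samples`), whereas rounding up always produces enough, and the evaluator trims any surplus back down to `n_samples` (the `len(generations[0]) > self.args.n_samples` check above). A small sketch with illustrative values:

```python
from math import ceil

n_samples, batch_size = 40, 16  # illustrative values only

n_copies_old = n_samples // batch_size       # 2 copies -> at most 2 * 16 = 32 samples
n_copies_new = ceil(n_samples / batch_size)  # 3 copies -> 3 * 16 = 48 samples, trimmed to 40 later

assert n_copies_new * batch_size >= n_samples
```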
4 changes: 2 additions & 2 deletions lm_eval/tasks/__init__.py
@@ -1,8 +1,7 @@
from pprint import pprint

from . import (apps, codexglue_code_to_text, codexglue_text_to_text, conala,
concode, ds1000, humaneval, mbpp, multiple)

concode, ds1000, gsm, humaneval, mbpp, multiple)

TASK_REGISTRY = {
**apps.create_all_tasks(),
@@ -15,6 +14,7 @@
**ds1000.create_all_tasks(),
"humaneval": humaneval.HumanEval,
"mbpp": mbpp.MBPP,
**gsm.create_all_tasks(),
}

ALL_TASKS = sorted(list(TASK_REGISTRY))
114 changes: 114 additions & 0 deletions lm_eval/tasks/custom_metrics/pal_metric/pal_code_exec.py
@@ -0,0 +1,114 @@
import os
import warnings
from collections import Counter, defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed

from lm_eval.tasks.custom_metrics.pal_metric.python_executor import run_program

# adapted from https://github.com/huggingface/evaluate/blob/main/metrics/code_eval/code_eval.py

_WARNING = """
################################################################################
!!!WARNING!!!
################################################################################
The "code_eval" metric executes untrusted model-generated code in Python.
Although it is highly unlikely that model-generated code will do something
overtly malicious in response to this test suite, model-generated code may act
destructively due to a lack of model capability or alignment.
Users are strongly encouraged to sandbox this evaluation suite so that it
does not perform destructive actions on their host or network. For more
information on how OpenAI sandboxes its code, see the paper "Evaluating Large
Language Models Trained on Code" (https://arxiv.org/abs/2107.03374).
Once you have read this disclaimer and taken appropriate precautions,
set the environment variable HF_ALLOW_CODE_EVAL="1". Within Python you can do this
with:
>>> import os
>>> os.environ["HF_ALLOW_CODE_EVAL"] = "1"
################################################################################\
"""


def compute(
predictions,
references,
num_workers=4,
timeout=3.0,
majority_voting=False,
answer_symbol=None,
):
"""
Returns the scores
:param majority_voting: bool
Takes the majority-voted answer to evaluate against the reference; defaults to False
:param answer_symbol: str
If specified, the result of execution is fetched from the program's global context; the program is expected to define the variable named in `answer_symbol` so that it is available in globals.
If not specified, the results are fetched from the stdout of the execution.
Defaults to None.
"""

if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
raise ValueError(_WARNING)

if os.name == "nt":
raise NotImplementedError("This metric is currently not supported on Windows.")

with ThreadPoolExecutor(max_workers=num_workers) as executor:
futures = []
completion_id = Counter()
n_samples = 0
results = defaultdict(list)

for task_id, candidates in enumerate(predictions):
for candidate in candidates:
args = (candidate, timeout, task_id, completion_id[task_id])
if answer_symbol:
args += (answer_symbol,)
future = executor.submit(run_program, *args)
futures.append(future)
completion_id[task_id] += 1
n_samples += 1

for future in as_completed(futures):
result = future.result()
results[result["task_id"]].append((result["completion_id"], result))

answers = [None] * len(results)
for result in results.values():
result.sort()
task_id = result[0][1]["task_id"]
# filtering the failed generations to avoid influencing majority voting
eval_answers = [
r[1]["result"]
for r in result
if isinstance(r[1]["result"], str)
and not r[1]["result"].startswith("failed:")
]
# if all generations failed, default to an empty string for scoring
eval_answers = [""] if len(eval_answers) == 0 else eval_answers
if majority_voting:
counter = Counter(eval_answers)
eval_answers = [counter.most_common()[0][0]]

if not majority_voting and len(eval_answers) > 1:
warnings.warn(
f"Multiple generations found for a task without setting `majority_voting` to True, defaulting answers from first generation"
)
answers[task_id] = eval_answers[0]

scores = []
# Number of generated programs that failed execution.
errored = 0
for task_id, (ans, ref) in enumerate(zip(answers, references)):
try:
score = 1 if abs(float(ans) - float(ref)) < 1e-3 else 0
except (ValueError, TypeError):
errored += 1
score = 0

scores.append(score)

return {"accuracy": sum(scores) / len(scores), "num_failed_execution": errored}
