diff --git a/bigcode_eval/tasks/codexglue_text_to_text.py b/bigcode_eval/tasks/codexglue_text_to_text.py
index bc01c1d4a..ab3eabf42 100644
--- a/bigcode_eval/tasks/codexglue_text_to_text.py
+++ b/bigcode_eval/tasks/codexglue_text_to_text.py
@@ -64,7 +64,7 @@ def get_dataset(self):
     def fewshot_examples(self):
         """Loads and returns the few-shot examples for the task if they exist."""
         with open(
-            "lm_eval/tasks/few_shot_examples/codexglue_text_to_text_few_shot_prompts.json",
+            "bigcode_eval/tasks/few_shot_examples/codexglue_text_to_text_few_shot_prompts.json",
             "r",
         ) as file:
             examples = json.load(file)
diff --git a/bigcode_eval/tasks/conala.py b/bigcode_eval/tasks/conala.py
index 35cd0f87b..80387db9a 100644
--- a/bigcode_eval/tasks/conala.py
+++ b/bigcode_eval/tasks/conala.py
@@ -47,7 +47,7 @@ def get_dataset(self):
     def fewshot_examples(self):
         """Loads and returns the few-shot examples for the task if they exist."""
         with open(
-            "lm_eval/tasks/few_shot_examples/conala_few_shot_prompts.json", "r"
+            "bigcode_eval/tasks/few_shot_examples/conala_few_shot_prompts.json", "r"
         ) as file:
             examples = json.load(file)
         return examples
diff --git a/bigcode_eval/tasks/concode.py b/bigcode_eval/tasks/concode.py
index 3895cf25a..7be43497c 100644
--- a/bigcode_eval/tasks/concode.py
+++ b/bigcode_eval/tasks/concode.py
@@ -47,7 +47,7 @@ def get_dataset(self):
     def fewshot_examples(self):
         """Loads and returns the few-shot examples for the task if they exist."""
         with open(
-            "lm_eval/tasks/few_shot_examples/concode_few_shot_prompts.json", "r"
+            "bigcode_eval/tasks/few_shot_examples/concode_few_shot_prompts.json", "r"
         ) as file:
             examples = json.load(file)
         return examples
diff --git a/bigcode_eval/tasks/gsm.py b/bigcode_eval/tasks/gsm.py
index f2d350cfd..478c2080f 100644
--- a/bigcode_eval/tasks/gsm.py
+++ b/bigcode_eval/tasks/gsm.py
@@ -105,7 +105,7 @@ def get_dataset(self):
     def fewshot_examples(self):
         """Loads and returns the few-shot examples for the task if they exist."""
         with open(
-            "lm_eval/tasks/few_shot_examples/gsm8k_few_shot_prompts.json",
+            "bigcode_eval/tasks/few_shot_examples/gsm8k_few_shot_prompts.json",
             "r",
         ) as file:
             examples = json.load(file)
diff --git a/docs/README.md b/docs/README.md
index 5a910720a..2f31b7da5 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -110,7 +110,7 @@ accelerate launch main.py \
 ```
 
-There is also a version to run the OpenAI API on HumanEvalPack at `lm_eval/tasks/humanevalpack_openai.py`. It requires the `openai` package that can be installed via `pip install openai`. You will need to set the environment variables `OPENAI_ORGANIZATION` and `OPENAI_API_KEY`. Then you may want to modify the global variables defined in the script, such as `LANGUAGE`. Finally, you can run it with `python lm_eval/tasks/humanevalpack_openai.py`.
+There is also a version to run the OpenAI API on HumanEvalPack at `bigcode_eval/tasks/humanevalpack_openai.py`. It requires the `openai` package that can be installed via `pip install openai`. You will need to set the environment variables `OPENAI_ORGANIZATION` and `OPENAI_API_KEY`. Then you may want to modify the global variables defined in the script, such as `LANGUAGE`. Finally, you can run it with `python bigcode_eval/tasks/humanevalpack_openai.py`.
 
 ### InstructHumanEval
 
diff --git a/docs/guide.md b/docs/guide.md
index b82c6cb60..0b5910515 100644
--- a/docs/guide.md
+++ b/docs/guide.md
@@ -16,10 +16,10 @@ pip install -r requirements.txt
 
 ## Creating Your Task File
 
-From the `bigcode-evaluation-harness` project root, copy over the `new_task.py` template to `lm_eval/tasks`.
+From the `bigcode-evaluation-harness` project root, copy over the `new_task.py` template to `bigcode_eval/tasks`.
 
 ```sh
-cp template/new_task.py lm_eval/tasks/.py
+cp template/new_task.py bigcode_eval/tasks/.py
 ```
 
 ## Task Heading
@@ -81,11 +81,11 @@ def get_prompt(self, doc):
     return ""
 ```
 
-If the prompt involves few-shot examples, you first need to save them in a json `_few_shot_prompts.json` in `lm_eval/tasks/few_shot_example` and then load them in `fewshot_examples` method like this:
+If the prompt involves few-shot examples, you first need to save them in a json `_few_shot_prompts.json` in `bigcode_eval/tasks/few_shot_example` and then load them in `fewshot_examples` method like this:
 
 ```python
 def fewshot_examples(self):
-    with open("lm_eval/tasks/few_shot_examples/_few_shot_prompts.json", "r") as file:
+    with open("bigcode_eval/tasks/few_shot_examples/_few_shot_prompts.json", "r") as file:
         examples = json.load(file)
     return examples
 ```
@@ -113,12 +113,12 @@ def process_results(self, generations, references):
     return {}
 ```
 
-You need to load your metric and run it. Check Hugging Face `evaluate` [library](https://huggingface.co/docs/evaluate/index) for the available metrics. For example [code_eval](https://huggingface.co/spaces/evaluate-metric/code_eval) for pass@k, [BLEU](https://huggingface.co/spaces/evaluate-metric/bleu) for BLEU score and [apps_metric](https://huggingface.co/spaces/codeparrot/apps_metric) are implemented. If you cannot find your desired metric, you can either add it to the `evaluate` library or implement it in the `lm_eval/tasks/custom_metrics` folder and import it from there.
+You need to load your metric and run it. Check Hugging Face `evaluate` [library](https://huggingface.co/docs/evaluate/index) for the available metrics. For example [code_eval](https://huggingface.co/spaces/evaluate-metric/code_eval) for pass@k, [BLEU](https://huggingface.co/spaces/evaluate-metric/bleu) for BLEU score and [apps_metric](https://huggingface.co/spaces/codeparrot/apps_metric) are implemented. If you cannot find your desired metric, you can either add it to the `evaluate` library or implement it in the `bigcode_eval/tasks/custom_metrics` folder and import it from there.
 
 ### Registering Your Task
 
-Now's a good time to register your task to expose it for usage. All you'll need to do is import your task module in `lm_eval/tasks/__init__.py` and provide an entry in the `TASK_REGISTRY` dictionary with the key as the name of your benchmark task (in the form it'll be referred to in the command line) and the value as the task class. See how it's done for other tasks in the [file](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/lm_eval/tasks/__init__.py).
+Now's a good time to register your task to expose it for usage. All you'll need to do is import your task module in `bigcode_eval/tasks/__init__.py` and provide an entry in the `TASK_REGISTRY` dictionary with the key as the name of your benchmark task (in the form it'll be referred to in the command line) and the value as the task class. See how it's done for other tasks in the [file](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/bigcode_eval/tasks/__init__.py).
 
 ## Task submission
@@ -136,7 +136,7 @@ Few-shot tasks are easier to conduct, but if you need to add the finetuning scri
 ## Code formatting
 You can format your changes and perform `black` standard checks
 ```sh
-black lm_eval/tasks/.py
+black bigcode_eval/tasks/.py
 ```
 ## Task documentation
 Please document your task with advised parameters for execution from litterature in the [docs](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/docs/README.md) like it's done for the other benchamrks.
diff --git a/templates/new_task.py b/templates/new_task.py
index 6d04608ca..228bcb29b 100644
--- a/templates/new_task.py
+++ b/templates/new_task.py
@@ -37,7 +37,7 @@ def get_dataset(self):
         return []
 
     def fewshot_examples(self):
-        # TODO: load few-shot examples (from lm_eval/tasks/fewshot_examples) if they exist
+        # TODO: load few-shot examples (from bigcode_eval/tasks/fewshot_examples) if they exist
        """Loads and returns the few-shot examples for the task if they exist."""
        pass
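Every hunk above repoints the same pattern: a task's `fewshot_examples` method opens its few-shot prompts JSON via a path relative to the current working directory, which is why the literal path must follow the `lm_eval` → `bigcode_eval` package rename. Below is a minimal, self-contained sketch of that pattern for reference; the standalone `load_fewshot_examples` helper and the `__main__` usage are illustrative only and not part of the harness, where the loader is a method on each task class as shown in the hunks.

```python
import json


def load_fewshot_examples(task_name):
    """Illustrative sketch: load a task's few-shot prompts JSON.

    The relative path is resolved from the process's current working
    directory (the repository root when the harness is launched from
    there), so it must name the renamed package directory:
    bigcode_eval, not lm_eval.
    """
    path = f"bigcode_eval/tasks/few_shot_examples/{task_name}_few_shot_prompts.json"
    with open(path, "r") as file:
        return json.load(file)


if __name__ == "__main__":
    # Assumes bigcode_eval/tasks/few_shot_examples/conala_few_shot_prompts.json
    # exists, as referenced by the conala.py hunk above.
    examples = load_fewshot_examples("conala")
    print(type(examples))
```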