From 5e735d820df434bae068bbb9fad83c1d9f587094 Mon Sep 17 00:00:00 2001 From: xianchaowu Date: Wed, 16 Aug 2023 06:06:13 +0000 Subject: [PATCH] mbpp code analysis --- WizardCoder/bigcode_install.sh | 11 ++++++ WizardCoder/mbpp_1_a100.sh | 67 +++++++++++++++++++++++++++++++++ WizardCoder/mbpp_2_a100.sh | 22 +++++++++++ WizardCoder/mbpp_3_a100.sh | 15 ++++++++ WizardCoder/src/mbpp_gen.py | 1 + WizardCoder/src/process_mbpp.py | 26 +++++++------ 6 files changed, 131 insertions(+), 11 deletions(-) create mode 100644 WizardCoder/bigcode_install.sh create mode 100644 WizardCoder/mbpp_1_a100.sh create mode 100644 WizardCoder/mbpp_2_a100.sh create mode 100644 WizardCoder/mbpp_3_a100.sh diff --git a/WizardCoder/bigcode_install.sh b/WizardCoder/bigcode_install.sh new file mode 100644 index 0000000..15d337f --- /dev/null +++ b/WizardCoder/bigcode_install.sh @@ -0,0 +1,11 @@ +######################################################################### +# File Name: bigcode.sh +# Author: Xianchao Wu +# mail: xianchaow@nvidia.com +# Created Time: Tue Aug 15 07:29:09 2023 +######################################################################### +#!/bin/bash + +git clone https://github.com/bigcode-project/bigcode-evaluation-harness.git +cd bigcode-evaluation-harness +pip install -e . diff --git a/WizardCoder/mbpp_1_a100.sh b/WizardCoder/mbpp_1_a100.sh new file mode 100644 index 0000000..79f1dd0 --- /dev/null +++ b/WizardCoder/mbpp_1_a100.sh @@ -0,0 +1,67 @@ +######################################################################### +# File Name: mbpp_1.sh +# Author: Xianchao Wu +# mail: xianchaow@nvidia.com +# Created Time: Thu Aug 10 06:39:18 2023 +######################################################################### +#!/bin/bash + +#model="/path/to/your/model" +#model="/workspace/asr/WizardLM/WizardCoder/models--WizardLM--WizardCoder-15B-V1.0/snapshots/926ca1b215c4631bc5f8c3e47173381452c23e5c" +model="/workspace/asr/Llama-X/src/checkpoints_wcode/models--WizardLM--WizardCoder-15B-V1.0/snapshots/69e87732535159460155972c3fac32a6241cc0ca" +temp=0.2 # temperature, TODO reset this 温度 +max_len=2048 +pred_num=2 #200 +num_seqs_per_iter=1 #2 + +output_path=preds/MBPP_T${temp}_N${pred_num} +mbpp_path=data/mbpp.test.jsonl # we provide this file in data/mbpp.test.zip + +mkdir -p ${output_path} +echo 'Output path: '$output_path +echo 'Model to eval: '$model + +# for debug NOTE +debug=0 +if [[ $debug == 1 ]] +then + gpu=1 + start_index=0 + end_index=2 + + CUDA_VISIBLE_DEVICES=$gpu python -m ipdb src/mbpp_gen.py --model ${model} \ + --start_index ${start_index} \ + --end_index ${end_index} \ + --temperature ${temp} \ + --num_seqs_per_iter ${num_seqs_per_iter} \ + --N ${pred_num} \ + --max_len ${max_len} \ + --output_path ${output_path} \ + --mbpp_path ${mbpp_path} + + exit 0 +fi + +# 500 problems, 63 per GPU if GPU=8 +index=0 +gpu_num=8 +for ((i = 0; i < $gpu_num; i++)); do + start_index=$((i * 50)) + end_index=$(((i + 1) * 50)) + + gpu=$((i)) + echo 'Running process #' ${i} 'from' $start_index 'to' $end_index 'on GPU' ${gpu} + ((index++)) + ( + CUDA_VISIBLE_DEVICES=$gpu python src/mbpp_gen.py --model ${model} \ + --start_index ${start_index} \ + --end_index ${end_index} \ + --temperature ${temp} \ + --num_seqs_per_iter ${num_seqs_per_iter} \ + --N ${pred_num} \ + --max_len ${max_len} \ + --output_path ${output_path} \ + --mbpp_path ${mbpp_path} + ) & + if (($index % $gpu_num == 0)); then wait; fi +done diff --git a/WizardCoder/mbpp_2_a100.sh b/WizardCoder/mbpp_2_a100.sh new file mode 100644 index 0000000..2f546fd --- /dev/null +++ b/WizardCoder/mbpp_2_a100.sh @@ -0,0 +1,22 @@ +######################################################################### +# File Name: mbpp_2_a100.sh +# Author: Xianchao Wu +# mail: xianchaow@nvidia.com +# Created Time: Tue Aug 15 06:47:10 2023 +######################################################################### +#!/bin/bash + +temp=0.2 +pred_num=2 + +output_path=preds/MBPP_T${temp}_N${pred_num} +echo ${output_path} + +mbpp_path=data/mbpp.test.jsonl # we provide this file in data/mbpp.test.zip + +echo 'Output path: '$output_path + +python -m ipdb src/process_mbpp.py --path ${output_path} \ + --out_path ${output_path}.jsonl \ + --mbpp_path ${mbpp_path} \ + --add_prompt diff --git a/WizardCoder/mbpp_3_a100.sh b/WizardCoder/mbpp_3_a100.sh new file mode 100644 index 0000000..bbc208b --- /dev/null +++ b/WizardCoder/mbpp_3_a100.sh @@ -0,0 +1,15 @@ +######################################################################### +# File Name: mbpp_3_a100.sh +# Author: Xianchao Wu +# mail: xianchaow@nvidia.com +# Created Time: Tue Aug 15 07:29:53 2023 +######################################################################### +#!/bin/bash + +jsonfn="/workspace/asr/WizardLM/WizardCoder/preds/MBPP_T0.2_N2.jsonl" + +#accelerate launch main.py --tasks mbpp \ +python -m ipdb main.py --tasks mbpp \ + --allow_code_execution \ + --load_generations_path $jsonfn \ + --model incoder-temperature-08 diff --git a/WizardCoder/src/mbpp_gen.py b/WizardCoder/src/mbpp_gen.py index d358603..36172b6 100644 --- a/WizardCoder/src/mbpp_gen.py +++ b/WizardCoder/src/mbpp_gen.py @@ -2,6 +2,7 @@ import argparse import pprint import sys +sys.path.append('/workspace/asr/WizardLM/WizardCoder/human-eval') import os import re from tqdm import tqdm diff --git a/WizardCoder/src/process_mbpp.py b/WizardCoder/src/process_mbpp.py index eee478b..d9b330d 100644 --- a/WizardCoder/src/process_mbpp.py +++ b/WizardCoder/src/process_mbpp.py @@ -1,3 +1,7 @@ +import sys + +sys.path.append('/workspace/asr/WizardLM/WizardCoder/human-eval') + from human_eval.data import stream_jsonl import glob from tqdm import tqdm @@ -30,7 +34,7 @@ def read_mbpp(path): parser.add_argument('--mbpp_path', type=str, help="") args = parser.parse_args() - +# Namespace(add_prompt=True, mbpp_path='data/mbpp.test.jsonl', out_path='preds/MBPP_T0.2_N2.jsonl', path='preds/MBPP_T0.2_N2') files = sorted(glob.glob(args.path + '/*.jsonl')) print("{} files in {}".format(len(files), args.path)) @@ -38,19 +42,19 @@ def read_mbpp(path): problems = read_mbpp(args.mbpp_path) output = [[] for _ in range(len(problems))] a = 0 -for code_file in tqdm(files, total=len(files)): +for code_file in tqdm(files, total=len(files)): # e.g., code_file='preds/MBPP_T0.2_N2/0.jsonl' codes = [c for c in stream_jsonl(code_file)] - if args.add_prompt: + if args.add_prompt: # True, in, NOTE for code in codes: task_id = code['task_id'] completion = code['completion'] if '```python' in completion: def_line = completion.index('```python') - completion = completion[def_line:].strip() - completion = completion.replace('```python', '') + completion = completion[def_line:].strip() # e.g., '```python\r\ndef remove_Occ(string, char):\r\n if char not in string:\r\n return string\r\n else:\r\n return string.replace(char, "", 1)[1:-1]\r\n```\r\n\r\nThe function takes two arguments: `string` and `char`. It first checks if the given character is present in the string. If not, it returns the original string. If the character is present, it uses the `replace()` method to remove all occurrences of the character except the first and last one. Finally, it returns the modified string.\r\n\r\nHere are some test cases:\r\n\r\n```python\r\nassert remove_Occ("hello","l") == "heo"\r\nassert remove_Occ("abcda","a") == "bcd"\r\nassert remove_Occ("PHP","P") == "H"\r\n```\r\n\r\nAll test cases pass.' + completion = completion.replace('```python', '') # '\r\ndef remove_Occ(string, char):\r\n if char not in string:\r\n return string\r\n else:\r\n return string.replace(char, "", 1)[1:-1]\r\n```\r\n\r\nThe function takes two arguments: `string` and `char`. It first checks if the given character is present in the string. If not, it returns the original string. If the character is present, it uses the `replace()` method to remove all occurrences of the character except the first and last one. Finally, it returns the modified string.\r\n\r\nHere are some test cases:\r\n\r\n\r\nassert remove_Occ("hello","l") == "heo"\r\nassert remove_Occ("abcda","a") == "bcd"\r\nassert remove_Occ("PHP","P") == "H"\r\n```\r\n\r\nAll test cases pass.' try: next_line = completion.index('\n```') - completion = completion[:next_line].strip() + completion = completion[:next_line].strip() # 'def remove_Occ(string, char):\r\n if char not in string:\r\n return string\r\n else:\r\n return string.replace(char, "", 1)[1:-1]' except: a += 1 if "__name__ == \"__main__\"" in completion: @@ -62,12 +66,12 @@ def read_mbpp(path): completion = completion[:next_line].strip() if "# Test examples" in completion: - next_line = completion.index('# Test examples') - completion = completion[:next_line].strip() + next_line = completion.index('# Test examples') # NOTE, e.g., 'def common_in_nested_lists(lst):\r\n common = []\r\n for i in range(len(lst)):\r\n for j in range(len(lst[i])):\r\n if lst[i][j] not in common:\r\n common.append(lst[i][j])\r\n return common\r\n\r\n# Test examples\r\nassert common_in_nested_lists([[12, 18, 23, 25, 45], [7, 12, 18, 24, 28], [1, 5, 8, 12, 15, 16, 18]])==[18, 12]\r\nassert common_in_nested_lists([[12, 5, 23, 25, 45], [7, 11, 5, 23, 28], [1, 5, 8, 18, 23, 16]])==[5,23]\r\nassert common_in_nested_lists([[2, 3,4, 1], [4, 5], [6,4, 8],[4, 5], [6, 8,4]])==[4]' + completion = completion[:next_line].strip() # NOTE, e.g., -> 'def common_in_nested_lists(lst):\r\n common = []\r\n for i in range(len(lst)):\r\n for j in range(len(lst[i])):\r\n if lst[i][j] not in common:\r\n common.append(lst[i][j])\r\n return common' output[task_id-11].append(completion) - -print("save to {}".format(args.out_path)) +# NOTE 核心逻辑就是,只抽取最纯洁的代码部分,放入jsonl文件中。 +print("save to {}".format(args.out_path)) # save to preds/MBPP_T0.2_N2.jsonl print(a) with open(args.out_path, "w", encoding="utf-8") as fout: - json.dump(output, fout) \ No newline at end of file + json.dump(output, fout)