Skip to content

Commit

Permalink
mbpp code analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
Xianchao-Wu committed Aug 16, 2023
1 parent 1de0298 commit 5e735d8
Show file tree
Hide file tree
Showing 6 changed files with 131 additions and 11 deletions.
11 changes: 11 additions & 0 deletions WizardCoder/bigcode_install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
#########################################################################
# File Name: bigcode_install.sh
# Author: Xianchao Wu
# mail: [email protected]
# Created Time: Tue Aug 15 07:29:09 2023
#########################################################################
# Clone the bigcode-evaluation-harness repository into the current
# directory and install it in editable mode into the active Python
# environment.

# Abort on the first failing command so that `pip install -e .` is never
# run in the wrong directory when the clone or the cd fails.
set -euo pipefail

repo_dir=bigcode-evaluation-harness

# Skip the clone when a checkout is already present so the script can be
# re-run (git clone refuses to write into an existing non-empty directory).
if [[ ! -d "${repo_dir}" ]]; then
  git clone https://github.com/bigcode-project/bigcode-evaluation-harness.git "${repo_dir}"
fi

cd "${repo_dir}"
pip install -e .
67 changes: 67 additions & 0 deletions WizardCoder/mbpp_1_a100.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/bin/bash
#########################################################################
# File Name: mbpp_1_a100.sh
# Author: Xianchao Wu
# mail: [email protected]
# Created Time: Thu Aug 10 06:39:18 2023
#########################################################################
# Step 1 of the MBPP pipeline: run src/mbpp_gen.py over the MBPP test
# problems, one background worker per GPU, writing into ${output_path}.
# The shebang must stay on the first line: this script relies on the
# bash-only [[ ]] and (( )) constructs.

set -euo pipefail

#model="/path/to/your/model"
#model="/workspace/asr/WizardLM/WizardCoder/models--WizardLM--WizardCoder-15B-V1.0/snapshots/926ca1b215c4631bc5f8c3e47173381452c23e5c"
model="/workspace/asr/Llama-X/src/checkpoints_wcode/models--WizardLM--WizardCoder-15B-V1.0/snapshots/69e87732535159460155972c3fac32a6241cc0ca"
temp=0.2 # sampling temperature, TODO reset this
max_len=2048
pred_num=2 #200
num_seqs_per_iter=1 #2

output_path=preds/MBPP_T${temp}_N${pred_num}
mbpp_path=data/mbpp.test.jsonl # we provide this file in data/mbpp.test.zip

# Fail early with a clear message instead of letting every worker crash.
if [[ ! -f "${mbpp_path}" ]]; then
  echo "Missing ${mbpp_path}; unzip data/mbpp.test.zip first." >&2
  exit 1
fi

mkdir -p "${output_path}"
echo 'Output path: '"$output_path"
echo 'Model to eval: '"$model"

# for debug NOTE: set debug=1 to run a tiny slice on one GPU under ipdb.
debug=0
if [[ $debug == 1 ]]
then
  gpu=1
  start_index=0
  end_index=2

  CUDA_VISIBLE_DEVICES=$gpu python -m ipdb src/mbpp_gen.py --model "${model}" \
    --start_index "${start_index}" \
    --end_index "${end_index}" \
    --temperature "${temp}" \
    --num_seqs_per_iter "${num_seqs_per_iter}" \
    --N "${pred_num}" \
    --max_len "${max_len}" \
    --output_path "${output_path}" \
    --mbpp_path "${mbpp_path}"

  exit 0
fi

# 500 problems, 63 per GPU if GPU=8.
# The per-GPU share is derived with ceiling division so that every problem
# is covered; a fixed stride of 50 with 8 GPUs stopped at index 400 and
# silently skipped the last 100 problems.
num_problems=500
gpu_num=8
per_gpu=$(( (num_problems + gpu_num - 1) / gpu_num ))

for ((i = 0; i < gpu_num; i++)); do
  start_index=$((i * per_gpu))
  end_index=$(((i + 1) * per_gpu))
  # Clamp the last slice to the number of problems.
  # NOTE(review): assumes --end_index is exclusive, as the original
  # i*50 .. (i+1)*50 ranges suggest -- confirm against src/mbpp_gen.py.
  if ((end_index > num_problems)); then end_index=$num_problems; fi

  gpu=$i
  echo 'Running process #' ${i} 'from' $start_index 'to' $end_index 'on GPU' ${gpu}
  (
    CUDA_VISIBLE_DEVICES=$gpu python src/mbpp_gen.py --model "${model}" \
      --start_index "${start_index}" \
      --end_index "${end_index}" \
      --temperature "${temp}" \
      --num_seqs_per_iter "${num_seqs_per_iter}" \
      --N "${pred_num}" \
      --max_len "${max_len}" \
      --output_path "${output_path}" \
      --mbpp_path "${mbpp_path}"
  ) &
done

# Block until every background worker has finished.
wait
22 changes: 22 additions & 0 deletions WizardCoder/mbpp_2_a100.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
#########################################################################
# File Name: mbpp_2_a100.sh
# Author: Xianchao Wu
# mail: [email protected]
# Created Time: Tue Aug 15 06:47:10 2023
#########################################################################
# Step 2 of the MBPP pipeline: run src/process_mbpp.py over the per-range
# prediction files in ${output_path} and write ${output_path}.jsonl.

set -euo pipefail

# temp and pred_num must match the values used in mbpp_1_a100.sh, because
# both scripts derive the same preds/MBPP_T<temp>_N<pred_num> directory.
temp=0.2
pred_num=2

output_path=preds/MBPP_T${temp}_N${pred_num}
echo "${output_path}"

mbpp_path=data/mbpp.test.jsonl # we provide this file in data/mbpp.test.zip

echo 'Output path: '"$output_path"

# Without the generated predictions there is nothing to post-process.
if [[ ! -d "${output_path}" ]]; then
  echo "Prediction directory ${output_path} not found; run mbpp_1_a100.sh first." >&2
  exit 1
fi

# NOTE: the script is started under the ipdb debugger for code analysis;
# drop "-m ipdb" for an unattended run.
python -m ipdb src/process_mbpp.py --path "${output_path}" \
  --out_path "${output_path}.jsonl" \
  --mbpp_path "${mbpp_path}" \
  --add_prompt
15 changes: 15 additions & 0 deletions WizardCoder/mbpp_3_a100.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
#########################################################################
# File Name: mbpp_3_a100.sh
# Author: Xianchao Wu
# mail: [email protected]
# Created Time: Tue Aug 15 07:29:53 2023
#########################################################################
# Step 3 of the MBPP pipeline: score the post-processed generations with
# the mbpp task of main.py.
# NOTE(review): main.py is presumably the entry point of
# bigcode-evaluation-harness, so this script is expected to be launched
# from inside that checkout -- confirm.

set -euo pipefail

# Output of mbpp_2_a100.sh (preds/MBPP_T<temp>_N<pred_num>.jsonl).
jsonfn="/workspace/asr/WizardLM/WizardCoder/preds/MBPP_T0.2_N2.jsonl"

# Fail early with a clear message rather than deep inside the evaluator.
if [[ ! -f "${jsonfn}" ]]; then
  echo "Generations file ${jsonfn} not found; run mbpp_2_a100.sh first." >&2
  exit 1
fi

# NOTE: started under the ipdb debugger for code analysis; the regular
# launcher is kept below for reference.
#accelerate launch main.py --tasks mbpp \
python -m ipdb main.py --tasks mbpp \
  --allow_code_execution \
  --load_generations_path "${jsonfn}" \
  --model incoder-temperature-08
1 change: 1 addition & 0 deletions WizardCoder/src/mbpp_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import argparse
import pprint
import sys
sys.path.append('/workspace/asr/WizardLM/WizardCoder/human-eval')
import os
import re
from tqdm import tqdm
Expand Down
26 changes: 15 additions & 11 deletions WizardCoder/src/process_mbpp.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
import sys

sys.path.append('/workspace/asr/WizardLM/WizardCoder/human-eval')

from human_eval.data import stream_jsonl
import glob
from tqdm import tqdm
Expand Down Expand Up @@ -30,27 +34,27 @@ def read_mbpp(path):
parser.add_argument('--mbpp_path', type=str, help="")

args = parser.parse_args()

# Namespace(add_prompt=True, mbpp_path='data/mbpp.test.jsonl', out_path='preds/MBPP_T0.2_N2.jsonl', path='preds/MBPP_T0.2_N2')

files = sorted(glob.glob(args.path + '/*.jsonl'))
print("{} files in {}".format(len(files), args.path))

problems = read_mbpp(args.mbpp_path)
output = [[] for _ in range(len(problems))]
a = 0
for code_file in tqdm(files, total=len(files)):
for code_file in tqdm(files, total=len(files)): # e.g., code_file='preds/MBPP_T0.2_N2/0.jsonl'
codes = [c for c in stream_jsonl(code_file)]
if args.add_prompt:
if args.add_prompt: # True, in, NOTE
for code in codes:
task_id = code['task_id']
completion = code['completion']
if '```python' in completion:
def_line = completion.index('```python')
completion = completion[def_line:].strip()
completion = completion.replace('```python', '')
completion = completion[def_line:].strip() # e.g., '```python\r\ndef remove_Occ(string, char):\r\n if char not in string:\r\n return string\r\n else:\r\n return string.replace(char, "", 1)[1:-1]\r\n```\r\n\r\nThe function takes two arguments: `string` and `char`. It first checks if the given character is present in the string. If not, it returns the original string. If the character is present, it uses the `replace()` method to remove all occurrences of the character except the first and last one. Finally, it returns the modified string.\r\n\r\nHere are some test cases:\r\n\r\n```python\r\nassert remove_Occ("hello","l") == "heo"\r\nassert remove_Occ("abcda","a") == "bcd"\r\nassert remove_Occ("PHP","P") == "H"\r\n```\r\n\r\nAll test cases pass.'
completion = completion.replace('```python', '') # '\r\ndef remove_Occ(string, char):\r\n if char not in string:\r\n return string\r\n else:\r\n return string.replace(char, "", 1)[1:-1]\r\n```\r\n\r\nThe function takes two arguments: `string` and `char`. It first checks if the given character is present in the string. If not, it returns the original string. If the character is present, it uses the `replace()` method to remove all occurrences of the character except the first and last one. Finally, it returns the modified string.\r\n\r\nHere are some test cases:\r\n\r\n\r\nassert remove_Occ("hello","l") == "heo"\r\nassert remove_Occ("abcda","a") == "bcd"\r\nassert remove_Occ("PHP","P") == "H"\r\n```\r\n\r\nAll test cases pass.'
try:
next_line = completion.index('\n```')
completion = completion[:next_line].strip()
completion = completion[:next_line].strip() # 'def remove_Occ(string, char):\r\n if char not in string:\r\n return string\r\n else:\r\n return string.replace(char, "", 1)[1:-1]'
except:
a += 1
if "__name__ == \"__main__\"" in completion:
Expand All @@ -62,12 +66,12 @@ def read_mbpp(path):
completion = completion[:next_line].strip()

if "# Test examples" in completion:
next_line = completion.index('# Test examples')
completion = completion[:next_line].strip()
next_line = completion.index('# Test examples') # NOTE, e.g., 'def common_in_nested_lists(lst):\r\n common = []\r\n for i in range(len(lst)):\r\n for j in range(len(lst[i])):\r\n if lst[i][j] not in common:\r\n common.append(lst[i][j])\r\n return common\r\n\r\n# Test examples\r\nassert common_in_nested_lists([[12, 18, 23, 25, 45], [7, 12, 18, 24, 28], [1, 5, 8, 12, 15, 16, 18]])==[18, 12]\r\nassert common_in_nested_lists([[12, 5, 23, 25, 45], [7, 11, 5, 23, 28], [1, 5, 8, 18, 23, 16]])==[5,23]\r\nassert common_in_nested_lists([[2, 3,4, 1], [4, 5], [6,4, 8],[4, 5], [6, 8,4]])==[4]'
completion = completion[:next_line].strip() # NOTE, e.g., -> 'def common_in_nested_lists(lst):\r\n common = []\r\n for i in range(len(lst)):\r\n for j in range(len(lst[i])):\r\n if lst[i][j] not in common:\r\n common.append(lst[i][j])\r\n return common'

output[task_id-11].append(completion)
print("save to {}".format(args.out_path))
# NOTE: the core logic is simply to extract only the clean code portion of each completion and put it into the jsonl file.
print("save to {}".format(args.out_path)) # save to preds/MBPP_T0.2_N2.jsonl
print(a)
with open(args.out_path, "w", encoding="utf-8") as fout:
json.dump(output, fout)
json.dump(output, fout)

0 comments on commit 5e735d8

Please sign in to comment.