mbpp code analysis

Xianchao-Wu · Aug 16, 2023 · 5e735d8 · 5e735d8
1 parent 1de0298
commit 5e735d8
Show file tree

Hide file tree

Showing 6 changed files with 131 additions and 11 deletions.
diff --git a/WizardCoder/bigcode_install.sh b/WizardCoder/bigcode_install.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+#########################################################################
+# File Name: bigcode_install.sh
+# Author: Xianchao Wu
+# mail: [email protected]
+# Created Time: Tue Aug 15 07:29:09 2023
+# Purpose: clone bigcode-evaluation-harness and install it in editable mode.
+#########################################################################
+git clone https://github.com/bigcode-project/bigcode-evaluation-harness.git
+cd bigcode-evaluation-harness || exit 1 # never run pip from the wrong directory
+pip install -e .
diff --git a/WizardCoder/mbpp_1_a100.sh b/WizardCoder/mbpp_1_a100.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+#########################################################################
+# File Name: mbpp_1_a100.sh
+# Author: Xianchao Wu
+# mail: [email protected]
+# Created Time: Thu Aug 10 06:39:18 2023
+# Purpose: run src/mbpp_gen.py over the MBPP test set, one process per GPU.
+#########################################################################
+# Model snapshot to evaluate; the commented-out line is an alternative snapshot.
+#model="/workspace/asr/WizardLM/WizardCoder/models--WizardLM--WizardCoder-15B-V1.0/snapshots/926ca1b215c4631bc5f8c3e47173381452c23e5c"
+model="/workspace/asr/Llama-X/src/checkpoints_wcode/models--WizardLM--WizardCoder-15B-V1.0/snapshots/69e87732535159460155972c3fac32a6241cc0ca"
+temp=0.2 # sampling temperature, TODO reset this
+max_len=2048
+pred_num=2 #200
+num_seqs_per_iter=1 #2
+
+output_path=preds/MBPP_T${temp}_N${pred_num}
+mbpp_path=data/mbpp.test.jsonl # we provide this file in data/mbpp.test.zip
+
+mkdir -p "${output_path}"
+echo "Output path: ${output_path}"
+echo "Model to eval: ${model}"
+
+# Debug mode: set debug=1 to run index range 0..2 under ipdb on GPU 1, then exit.
+debug=0
+if [[ $debug == 1 ]]
+then
+    gpu=1
+    start_index=0
+    end_index=2
+
+    CUDA_VISIBLE_DEVICES=$gpu python -m ipdb src/mbpp_gen.py --model "${model}" \
+      --start_index ${start_index} \
+      --end_index ${end_index} \
+      --temperature ${temp} \
+      --num_seqs_per_iter ${num_seqs_per_iter} \
+      --N ${pred_num} \
+      --max_len ${max_len} \
+      --output_path "${output_path}" \
+      --mbpp_path "${mbpp_path}"
+
+    exit 0
+fi
+
+# 500 problems split across the GPUs: ceil(500 / 8) = 63 per GPU, last chunk clamped to 500.
+num_problems=500
+gpu_num=8
+per_gpu=$(( (num_problems + gpu_num - 1) / gpu_num ))
+for ((i = 0; i < gpu_num; i++)); do
+  start_index=$((i * per_gpu))
+  end_index=$(((i + 1) * per_gpu))
+  if ((end_index > num_problems)); then end_index=$num_problems; fi
+  gpu=$i
+  echo 'Running process #' ${i} 'from' $start_index 'to' $end_index 'on GPU' ${gpu}
+  (
+    CUDA_VISIBLE_DEVICES=$gpu python src/mbpp_gen.py --model "${model}" \
+      --start_index ${start_index} \
+      --end_index ${end_index} \
+      --temperature ${temp} \
+      --num_seqs_per_iter ${num_seqs_per_iter} \
+      --N ${pred_num} \
+      --max_len ${max_len} \
+      --output_path "${output_path}" \
+      --mbpp_path "${mbpp_path}"
+  ) &
+done
+wait # block until every per-GPU background worker has finished
diff --git a/WizardCoder/mbpp_2_a100.sh b/WizardCoder/mbpp_2_a100.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+#########################################################################
+# File Name: mbpp_2_a100.sh
+# Author: Xianchao Wu
+# mail: [email protected]
+# Created Time: Tue Aug 15 06:47:10 2023
+# Purpose: run src/process_mbpp.py on the prediction directory and write ${output_path}.jsonl.
+#########################################################################
+# temp and pred_num must match mbpp_1_a100.sh so output_path names the same directory.
+temp=0.2
+pred_num=2
+
+output_path=preds/MBPP_T${temp}_N${pred_num}
+mbpp_path=data/mbpp.test.jsonl # we provide this file in data/mbpp.test.zip
+
+echo "Output path: ${output_path}"
+
+# NOTE: "-m ipdb" starts the script inside the ipdb debugger; drop it for unattended runs.
+python -m ipdb src/process_mbpp.py --path "${output_path}" \
+	--out_path "${output_path}.jsonl" \
+	--mbpp_path "${mbpp_path}" \
+	--add_prompt
diff --git a/WizardCoder/mbpp_3_a100.sh b/WizardCoder/mbpp_3_a100.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+#########################################################################
+# File Name: mbpp_3_a100.sh
+# Author: Xianchao Wu
+# mail: [email protected]
+# Created Time: Tue Aug 15 07:29:53 2023
+# Purpose: evaluate saved MBPP generations via main.py (presumably bigcode-evaluation-harness; "-m ipdb" runs it in the debugger).
+#########################################################################
+jsonfn="/workspace/asr/WizardLM/WizardCoder/preds/MBPP_T0.2_N2.jsonl"
+
+#accelerate launch  main.py   --tasks mbpp \
+python -m ipdb main.py --tasks mbpp \
+	--allow_code_execution \
+	--load_generations_path "$jsonfn" \
+	--model incoder-temperature-08
diff --git a/WizardCoder/src/mbpp_gen.py b/WizardCoder/src/mbpp_gen.py
@@ -2,6 +2,7 @@
 import argparse
 import pprint
 import sys
+sys.path.append('/workspace/asr/WizardLM/WizardCoder/human-eval')
 import os
 import re
 from tqdm import tqdm

diff --git a/WizardCoder/src/process_mbpp.py b/WizardCoder/src/process_mbpp.py
@@ -1,3 +1,7 @@
+import sys
+
+sys.path.append('/workspace/asr/WizardLM/WizardCoder/human-eval')
+
 from human_eval.data import stream_jsonl
 import glob 
 from tqdm import tqdm
@@ -30,27 +34,27 @@ def read_mbpp(path):
 parser.add_argument('--mbpp_path', type=str, help="")
 
 args = parser.parse_args()
-
+# Namespace(add_prompt=True, mbpp_path='data/mbpp.test.jsonl', out_path='preds/MBPP_T0.2_N2.jsonl', path='preds/MBPP_T0.2_N2')
 
 files = sorted(glob.glob(args.path + '/*.jsonl'))
 print("{} files in {}".format(len(files), args.path))
 
 problems = read_mbpp(args.mbpp_path)
 output = [[] for _ in range(len(problems))]
 a = 0
-for code_file in tqdm(files, total=len(files)):
+for code_file in tqdm(files, total=len(files)): # e.g., code_file='preds/MBPP_T0.2_N2/0.jsonl'
     codes = [c for c in stream_jsonl(code_file)]
-    if args.add_prompt: 
+    if args.add_prompt: # NOTE: True when --add_prompt is passed (as in mbpp_2_a100.sh), so this branch is taken
         for code in codes:
             task_id = code['task_id']
             completion = code['completion']
             if '```python' in completion: 
                 def_line = completion.index('```python')
-                completion = completion[def_line:].strip()
-                completion = completion.replace('```python', '')
+                completion = completion[def_line:].strip() # e.g., '```python\r\ndef remove_Occ(string, char):\r\n    if char not in string:\r\n        return string\r\n    else:\r\n        return string.replace(char, "", 1)[1:-1]\r\n```\r\n\r\nThe function takes two arguments: `string` and `char`. It first checks if the given character is present in the string. If not, it returns the original string. If the character is present, it uses the `replace()` method to remove all occurrences of the character except the first and last one. Finally, it returns the modified string.\r\n\r\nHere are some test cases:\r\n\r\n```python\r\nassert remove_Occ("hello","l") == "heo"\r\nassert remove_Occ("abcda","a") == "bcd"\r\nassert remove_Occ("PHP","P") == "H"\r\n```\r\n\r\nAll test cases pass.'
+                completion = completion.replace('```python', '') # '\r\ndef remove_Occ(string, char):\r\n    if char not in string:\r\n        return string\r\n    else:\r\n        return string.replace(char, "", 1)[1:-1]\r\n```\r\n\r\nThe function takes two arguments: `string` and `char`. It first checks if the given character is present in the string. If not, it returns the original string. If the character is present, it uses the `replace()` method to remove all occurrences of the character except the first and last one. Finally, it returns the modified string.\r\n\r\nHere are some test cases:\r\n\r\n\r\nassert remove_Occ("hello","l") == "heo"\r\nassert remove_Occ("abcda","a") == "bcd"\r\nassert remove_Occ("PHP","P") == "H"\r\n```\r\n\r\nAll test cases pass.'
                 try:
                     next_line = completion.index('\n```')
-                    completion = completion[:next_line].strip()
+                    completion = completion[:next_line].strip() # 'def remove_Occ(string, char):\r\n    if char not in string:\r\n        return string\r\n    else:\r\n        return string.replace(char, "", 1)[1:-1]'
                 except:
                     a += 1
             if "__name__ == \"__main__\"" in completion:
@@ -62,12 +66,12 @@ def read_mbpp(path):
                 completion = completion[:next_line].strip()
 
             if "# Test examples" in completion:
-                next_line = completion.index('# Test examples')
-                completion = completion[:next_line].strip()
+                next_line = completion.index('# Test examples') # NOTE, e.g., 'def common_in_nested_lists(lst):\r\n    common = []\r\n    for i in range(len(lst)):\r\n        for j in range(len(lst[i])):\r\n            if lst[i][j] not in common:\r\n                common.append(lst[i][j])\r\n    return common\r\n\r\n# Test examples\r\nassert common_in_nested_lists([[12, 18, 23, 25, 45], [7, 12, 18, 24, 28], [1, 5, 8, 12, 15, 16, 18]])==[18, 12]\r\nassert common_in_nested_lists([[12, 5, 23, 25, 45], [7, 11, 5, 23, 28], [1, 5, 8, 18, 23, 16]])==[5,23]\r\nassert common_in_nested_lists([[2, 3,4, 1], [4, 5], [6,4, 8],[4, 5], [6, 8,4]])==[4]'
+                completion = completion[:next_line].strip() # NOTE, e.g., -> 'def common_in_nested_lists(lst):\r\n    common = []\r\n    for i in range(len(lst)):\r\n        for j in range(len(lst[i])):\r\n            if lst[i][j] not in common:\r\n                common.append(lst[i][j])\r\n    return common'
 
             output[task_id-11].append(completion)
-    
-print("save to {}".format(args.out_path))
+# NOTE: the core logic is simply to extract only the pure code portion of each completion and put it into the jsonl file.
+print("save to {}".format(args.out_path)) # save to preds/MBPP_T0.2_N2.jsonl
 print(a)
 with open(args.out_path, "w", encoding="utf-8") as fout:
-    json.dump(output, fout)
+    json.dump(output, fout)