Commit: changes and merge by hand

Xianchao-Wu committed Aug 10, 2023
2 parents 8c4cc7e + 82d258b commit f687164
Showing 8 changed files with 176 additions and 77 deletions.
Binary file removed WizardCoder/data/humaneval.59.8.gen.zip
Binary file removed WizardCoder/data/mbpp.test.zip
14 changes: 14 additions & 0 deletions WizardCoder/download.py
@@ -0,0 +1,14 @@

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

import os
cdir = os.getcwd()  # cache the download under the current working directory
print(cdir)

tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardCoder-15B-V1.0", cache_dir=cdir)
print(tokenizer)

model = AutoModelForCausalLM.from_pretrained("WizardLM/WizardCoder-15B-V1.0", cache_dir=cdir)
print(model)
print(sum(p.numel() for p in model.parameters()))  # total number of parameters
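
Note: the shell scripts in this commit hard-code the snapshot directory (models--WizardLM--WizardCoder-15B-V1.0/snapshots/&lt;hash&gt;) that this download produces. As a minimal sketch (not part of the commit, and assuming the huggingface_hub package that transformers depends on), the snapshot path can be resolved programmatically instead of pasting the hash:

# Sketch (not in this commit): resolve the cached snapshot directory by repo id.
import os
from huggingface_hub import snapshot_download

cdir = os.getcwd()
# Downloads on the first call, then returns the cached snapshot path.
local_path = snapshot_download("WizardLM/WizardCoder-15B-V1.0", cache_dir=cdir)
print(local_path)  # .../models--WizardLM--WizardCoder-15B-V1.0/snapshots/<hash>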
16 changes: 9 additions & 7 deletions WizardCoder/inf.sh
@@ -2,15 +2,17 @@
# File Name: inf.sh
# Author: Xianchao Wu
# mail: [email protected]
-# Created Time: Mon Jul 3 04:13:18 2023
+# Created Time: Fri Jul 28 07:55:20 2023
#########################################################################
#!/bin/bash

ckpt="/workspace/asr/Llama-X/src/checkpoints_wcode/models--WizardLM--WizardCoder-15B-V1.0/snapshots/69e87732535159460155972c3fac32a6241cc0ca"
indata="/workspace/asr/WizardLM/WizardCoder/data/in.data.jsonl"
outdata="/workspace/asr/WizardLM/WizardCoder/data/out.res.jsonl"
bmodel="/workspace/asr/WizardLM/WizardCoder/models--WizardLM--WizardCoder-15B-V1.0/snapshots/926ca1b215c4
631bc5f8c3e47173381452c23e5c"
inpath="./data/in.jsonl"
outpath="./data/out.jsonl"

python -m ipdb src/inference_wizardcoder.py \
--base_model $ckpt \
--input_data_path $indata \
--output_data_path $outdata
--base_model $bmodel \
--input_data_path $inpath \
--output_data_path $outpath

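For reference, src/inference_wizardcoder.py (changed below) reads one JSON object per line from the input file, each carrying an "idx" and an "Instruction" field. A minimal sketch for producing ./data/in.jsonl with the jsonlines package the script already uses (the second instruction is an illustrative placeholder):

# Sketch (not in this commit): write a small ./data/in.jsonl for inf.sh.
import jsonlines

samples = [
    {"idx": 11, "Instruction": "Write a Python code to count 1 to 10."},
    {"idx": 12, "Instruction": "Write a Python function to reverse a string."},  # illustrative
]
with jsonlines.open("./data/in.jsonl", mode="w") as writer:
    for sample in samples:
        writer.write(sample)
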
17 changes: 17 additions & 0 deletions WizardCoder/inf_a100.sh
@@ -0,0 +1,17 @@
#########################################################################
# File Name: inf_a100.sh
# Author: Xianchao Wu
# mail: [email protected]
# Created Time: Mon Jul 3 04:13:18 2023
#########################################################################
#!/bin/bash

ckpt="/workspace/asr/Llama-X/src/checkpoints_wcode/models--WizardLM--WizardCoder-15B-V1.0/snapshots/69e87732535159460155972c3fac32a6241cc0ca"
inpath="./data/in.jsonl"
outpath="./data/out.jsonl"

python -m ipdb src/inference_wizardcoder.py \
--base_model $ckpt \
--input_data_path $inpath \
--output_data_path $outpath

66 changes: 66 additions & 0 deletions WizardCoder/mbpp_1.sh
@@ -0,0 +1,66 @@
#########################################################################
# File Name: mbpp_1.sh
# Author: Xianchao Wu
# mail: [email protected]
# Created Time: Thu Aug 10 06:39:18 2023
#########################################################################
#!/bin/bash

#model="/path/to/your/model"
model="/workspace/asr/WizardLM/WizardCoder/models--WizardLM--WizardCoder-15B-V1.0/snapshots/926ca1b215c4631bc5f8c3e47173381452c23e5c"
temp=0.2 # sampling temperature; TODO: revisit this value
max_len=2048
pred_num=200
num_seqs_per_iter=2

output_path=preds/MBPP_T${temp}_N${pred_num}
mbpp_path=data/mbpp.test.jsonl # we provide this file in data/mbpp.test.zip

mkdir -p ${output_path}
echo 'Output path: '$output_path
echo 'Model to eval: '$model

# debug mode (set debug=1 to run only the first two problems under ipdb) NOTE
debug=0
if [[ $debug == 1 ]]
then
gpu=1
start_index=0
end_index=2

CUDA_VISIBLE_DEVICES=$gpu python -m ipdb src/mbpp_gen.py --model ${model} \
--start_index ${start_index} \
--end_index ${end_index} \
--temperature ${temp} \
--num_seqs_per_iter ${num_seqs_per_iter} \
--N ${pred_num} \
--max_len ${max_len} \
--output_path ${output_path} \
--mbpp_path ${mbpp_path}

exit 0
fi

# 500 problems, 63 per GPU if GPU=8
index=0
gpu_num=8
for ((i = 0; i < $gpu_num; i++)); do
start_index=$((i * 63)) # 63 per GPU so 8 GPUs cover all 500 problems
end_index=$(((i + 1) * 63))

gpu=$((i))
echo 'Running process #' ${i} 'from' $start_index 'to' $end_index 'on GPU' ${gpu}
((index++))
(
CUDA_VISIBLE_DEVICES=$gpu python src/mbpp_gen.py --model ${model} \
--start_index ${start_index} \
--end_index ${end_index} \
--temperature ${temp} \
--num_seqs_per_iter ${num_seqs_per_iter} \
--N ${pred_num} \
--max_len ${max_len} \
--output_path ${output_path} \
--mbpp_path ${mbpp_path}
) &
if (($index % $gpu_num == 0)); then wait; fi
done
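
The loop above shards the MBPP problems across GPUs by index range and launches one background generation job per GPU, waiting after each full batch. A minimal sketch of the same shard arithmetic (problem count and GPU count taken from the script; the explicit clamp at 500 is an addition, since the bash loop lets the last end_index overshoot):

# Sketch (not in this commit): shard ranges produced by the bash loop above.
num_problems = 500                     # size of the MBPP test split used here
gpu_num = 8
per_gpu = -(-num_problems // gpu_num)  # ceil(500 / 8) = 63

for i in range(gpu_num):
    start_index = i * per_gpu
    end_index = min((i + 1) * per_gpu, num_problems)
    print(f"GPU {i}: problems [{start_index}, {end_index})")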
76 changes: 38 additions & 38 deletions WizardCoder/src/inference_wizardcoder.py
@@ -15,73 +15,73 @@

 try:
     if torch.backends.mps.is_available():
-        device = "mps"
+        device = "mps" # Metal Performance Shaders (MPS), Apple's GPU backend for PyTorch
 except:
     pass

 def evaluate(
-    batch_data,
-    tokenizer,
-    model,
+    batch_data, # e.g. 'Write a Python code to count 1 to 10.'
+    tokenizer, # GPT2TokenizerFast(name_or_path='/workspace/asr/WizardLM/WizardCoder/models--WizardLM--WizardCoder-15B-V1.0/snapshots/926ca1b215c4631bc5f8c3e47173381452c23e5c', vocab_size=49152, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '[PAD]', 'additional_special_tokens': ['<|endoftext|>', '<fim_prefix>', '<fim_middle>', '<fim_suffix>', '<fim_pad>', '<filename>', '<gh_stars>', '<issue_start>', '<issue_comment>', '<issue_closed>', '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>', '<empty_output>', '<commit_before>', '<commit_msg>', '<commit_after>', '<reponame>']}, clean_up_tokenization_spaces=True)
+    model, # type(model)=<class 'transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM'>
     input=None,
     temperature=1,
     top_p=0.9,
     top_k=40,
     num_beams=1,
     max_new_tokens=2048,
-    **kwargs,
+    **kwargs, # {}
 ):
     prompts = generate_prompt(batch_data, input)
-    inputs = tokenizer(prompts, return_tensors="pt", max_length=256, truncation=True, padding=True)
+    inputs = tokenizer(prompts, return_tensors="pt", max_length=256, truncation=True, padding=True) # ipdb> p inputs: {'input_ids': tensor([[27400, 438, 600, 12404, 688, 18872, 312, 2899, 32, 5950, 312, 1789, 688, 36808, 30772, 322, 1326, 32, 203, 203, 1482, 21081, 44, 203, 2538, 312, 4865, 1340, 372, 2385, 225, 35, 372, 225, 35, 34, 32, 203, 203, 1482, 5170, 44]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}; inputs['input_ids'].shape == torch.Size([1, 42]), inputs['attention_mask'].shape == torch.Size([1, 42])
     input_ids = inputs["input_ids"].to(device)
     generation_config = GenerationConfig(
-        temperature=temperature,
-        top_p=top_p,
-        top_k=top_k,
-        num_beams=num_beams,
-        eos_token_id=tokenizer.eos_token_id,
-        pad_token_id=tokenizer.pad_token_id,
-        **kwargs,
+        temperature=temperature, # 1
+        top_p=top_p, # 0.9
+        top_k=top_k, # 40
+        num_beams=num_beams, # 1
+        eos_token_id=tokenizer.eos_token_id, # 0
+        pad_token_id=tokenizer.pad_token_id, # 49152
+        **kwargs, # {}
     )
     import ipdb; ipdb.set_trace()
     with torch.no_grad():
-        generation_output = model.generate(
-            input_ids=input_ids,
+        generation_output = model.generate( # NOTE: steps into /opt/conda/lib/python3.8/site-packages/transformers/generation/utils.py(1160)generate()
+            input_ids=input_ids, # shape [1, 42], batch-size=1
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
-            max_new_tokens=max_new_tokens,
+            max_new_tokens=max_new_tokens, # 2048
        )
-    s = generation_output.sequences
+    s = generation_output.sequences # size=[1, 220], alike=tensor([[27400, 438, 600, 12404, 688, 18872, 312, 2899, 32, 5950, ...]], device='cuda:0')
     output = tokenizer.batch_decode(s, skip_special_tokens=True)
-    return output
+    return output # ["Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nWrite a Python code to count 1 to 10.\n\n### Response:Here's the Python code to count 1 to 10:\r\n\r\n```python\r\nfor i in range(1, 11):\r\n print(i)\r\n```\r\n\r\nOutput:\r\n\r\n```\r\n1\r\n2\r\n3\r\n4\r\n5\r\n6\r\n7\r\n8\r\n9\r\n10\r\n```\r\n\r\nExplanation:\r\n\r\n- The `range()` function generates a sequence of numbers from the starting value (inclusive) to the ending value (exclusive).\r\n- In this case, we start with 1 and go up to 11 (exclusive) because we want to count 10.\r\n- The `for` loop iterates over each number in the sequence and assigns it to the variable `i`.\r\n- The `print()` function outputs the value of `i` on a new line."] NOTE: the decoded output still contains the original prompt and instruction.


-def generate_prompt(instruction, input=None):
+def generate_prompt(instruction, input=None): # NOTE: the `input` parameter is never used
     return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

 ### Instruction:
 {instruction}

-### Response:"""
+### Response:""" # out = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nWrite a Python code to count 1 to 10.\n\n### Response:'


 def main(
-    load_8bit: bool = False,
-    base_model: str = "Model_Path",
-    input_data_path = "Input.jsonl",
-    output_data_path = "Output.jsonl",
+    load_8bit: bool = False, # False
+    base_model: str = "Model_Path", # '/workspace/asr/WizardLM/WizardCoder/models--WizardLM--WizardCoder-15B-V1.0/snapshots/926ca1b215c4631bc5f8c3e47173381452c23e5c'
+    input_data_path = "Input.jsonl", # './data/in.jsonl'
+    output_data_path = "Output.jsonl", # './data/out.jsonl'
 ):
     assert base_model, (
-        "Please specify a --base_model, e.g. --base_model='bigcode/starcoder'"
+        "Please specify a --base_model, e.g. --base_model='bigcode/starcoder'" # NOTE: both a Hub model name and a local model path are supported here
     )

-    tokenizer = AutoTokenizer.from_pretrained(base_model)
+    tokenizer = AutoTokenizer.from_pretrained(base_model) # same GPT2TokenizerFast as shown in evaluate() above (vocab_size=49152, model_max_length=2048, pad_token '[PAD]')
     if device == "cuda":
         model = AutoModelForCausalLM.from_pretrained(
             base_model,
-            load_in_8bit=load_8bit,
-            torch_dtype=torch.float16,
+            load_in_8bit=load_8bit, # False
+            torch_dtype=torch.float16, # TODO: this should be configurable from the command line
             device_map="auto",
         )
     elif device == "mps":
@@ -91,31 +91,31 @@ def main(
             torch_dtype=torch.float16,
         )

-    model.config.pad_token_id = tokenizer.pad_token_id
+    model.config.pad_token_id = tokenizer.pad_token_id # 49152

     if not load_8bit:
-        model.half()
+        model.half() # NOTE: this branch is entered; the weights were already loaded as float16, so half() changes nothing...

     model.eval()
-    if torch.__version__ >= "2" and sys.platform != "win32":
+    if torch.__version__ >= "2" and sys.platform != "win32": # here torch=='1.13.0+cu116' on 'linux', so this branch is not taken
         model = torch.compile(model)

     input_data = jsonlines.open(input_data_path, mode='r')
     output_data = jsonlines.open(output_data_path, mode='w')

     for num, line in enumerate(input_data):
-        one_data = line
+        one_data = line # {'idx': 11, 'Instruction': 'Write a Python code to count 1 to 10.'}
         id = one_data["idx"]
         instruction = one_data["Instruction"]
         print(instruction)
-        _output = evaluate(instruction, tokenizer, model)
-        final_output = _output[0].split("### Response:")[1].strip()
+        _output = evaluate(instruction, tokenizer, model) # NOTE
+        final_output = _output[0].split("### Response:")[1].strip() # because the loop processes one line at a time, _output can only hold a single element! NOTE final_output="Here's the Python code to count 1 to 10:\r\n\r\n```python\r\nfor i in range(1, 11):\r\n print(i)\r\n```\r\n\r\nOutput:\r\n\r\n```\r\n1\r\n2\r\n3\r\n4\r\n5\r\n6\r\n7\r\n8\r\n9\r\n10\r\n```\r\n\r\nExplanation:\r\n\r\n- The `range()` function generates a sequence of numbers from the starting value (inclusive) to the ending value (exclusive).\r\n- In this case, we start with 1 and go up to 11 (exclusive) because we want to count 10.\r\n- The `for` loop iterates over each number in the sequence and assigns it to the variable `i`.\r\n- The `print()` function outputs the value of `i` on a new line."
         new_data = {
-            "id": id,
-            "instruction": instruction,
-            "wizardcoder": final_output
+            "id": id, # 11
+            "instruction": instruction, # 'Write a Python code to count 1 to 10.'
+            "wizardcoder": final_output # same final_output string as above
         }
-        output_data.write(new_data)
+        output_data.write(new_data) # {'id': 11, 'instruction': 'Write a Python code to count 1 to 10.', 'wizardcoder': <final_output as above>}


 if __name__ == "__main__":