Commit

debug train wizardcoder

Xianchao-Wu committed Aug 19, 2023
1 parent 0475da2 commit 19000ab

Showing 7 changed files with 217 additions and 37 deletions.
45 changes: 45 additions & 0 deletions WizardCoder/humaneval_1_a100.sh
@@ -0,0 +1,45 @@
#########################################################################
# File Name: humaneval_1_a100.sh
# Author: Xianchao Wu
# mail: [email protected]
# Created Time: Wed Aug 16 08:45:55 2023
#########################################################################
#!/bin/bash

#model="/path/to/your/model"
#model="/workspace/asr/WizardLM/WizardCoder/models--WizardLM--WizardCoder-15B-V1.0/snapshots/926ca1b215c4631bc5f8c3e47173381452c23e5c"

model="/workspace/asr/Llama-X/src/checkpoints_wcode/models--WizardLM--WizardCoder-15B-V1.0/snapshots/69e87732535159460155972c3fac32a6241cc0ca"
temp=0.2
max_len=2048
pred_num=20 #200
num_seqs_per_iter=2 #2

output_path=preds/humaneval_T${temp}_N${pred_num}_S${num_seqs_per_iter}

mkdir -p ${output_path}
echo 'Output path: '$output_path
echo 'Model to eval: '$model

# 164 problems, 21 per GPU if GPU=8
index=0
gpu_num=8
for ((i = 0; i < $gpu_num; i++)); do
start_index=$((i * 21))
end_index=$(((i + 1) * 21))

gpu=$((i))
echo 'Running process #' ${i} 'from' $start_index 'to' $end_index 'on GPU' ${gpu}
((index++))
(
CUDA_VISIBLE_DEVICES=$gpu python src/humaneval_gen.py --model ${model} \
--start_index ${start_index} \
--end_index ${end_index} \
--temperature ${temp} \
--num_seqs_per_iter ${num_seqs_per_iter} \
--N ${pred_num} \
--max_len ${max_len} \
--output_path ${output_path}
) &
if (($index % $gpu_num == 0)); then wait; fi
done
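A note on the shard arithmetic above: 8 GPUs * 21 problems = 168, which overshoots the 164 HumanEval tasks by 4, so the last shard is assumed to be clipped inside humaneval_gen.py (not shown in this diff). A minimal Python sketch of the same split with an explicit clip:

# Sketch only: mirrors the shard arithmetic in humaneval_1_a100.sh, with an explicit
# clip at 164 problems (assumed behaviour; the script relies on the generator to stop at the end).
NUM_PROBLEMS = 164  # HumanEval task count
GPU_NUM = 8
PER_GPU = 21        # ceil(164 / 8)

for gpu in range(GPU_NUM):
    start_index = gpu * PER_GPU
    end_index = min((gpu + 1) * PER_GPU, NUM_PROBLEMS)
    print(f"GPU {gpu}: problems [{start_index}, {end_index})")
# The last shard is [147, 164): 17 problems instead of 21.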
19 changes: 19 additions & 0 deletions WizardCoder/humaneval_2_a100.sh
@@ -0,0 +1,19 @@
#########################################################################
# File Name: humaneval_2_a100.sh
# Author: Xianchao Wu
# mail: [email protected]
# Created Time: Thu Aug 17 23:05:32 2023
#########################################################################
#!/bin/bash

temp=0.2
pred_num=1

output_path=preds/humaneval_T${temp}_N${pred_num}

echo 'Output path: '$output_path
python src/process_humaneval.py --path ${output_path} \
--out_path ${output_path}.jsonl \
--add_prompt

evaluate_functional_correctness ${output_path}.jsonl
46 changes: 46 additions & 0 deletions WizardCoder/humaneval_3_a100.sh
@@ -0,0 +1,46 @@
#########################################################################
# File Name: humaneval_3_a100.sh
# Author: Xianchao Wu
# mail: [email protected]
# Created Time: Thu Aug 17 23:20:01 2023
#########################################################################
#!/bin/bash

#model="WizardLM/WizardCoder-15B-V1.0"
model="/workspace/asr/Llama-X/src/checkpoints_wcode/models--WizardLM--WizardCoder-15B-V1.0/snapshots/69e87732535159460155972c3fac32a6241cc0ca"

temp=0.0
max_len=2048
pred_num=1
num_seqs_per_iter=1

output_path=preds/T${temp}_N${pred_num}_WizardCoder_Greedy_Decode

mkdir -p ${output_path}
echo 'Output path: '$output_path
echo 'Model to eval: '$model

# 164 problems, 21 per GPU if GPU=8
index=0
gpu_num=8
for ((i = 0; i < $gpu_num; i++)); do
start_index=$((i * 21))
end_index=$(((i + 1) * 21))

gpu=$((i))
echo 'Running process #' ${i} 'from' $start_index 'to' $end_index 'on GPU' ${gpu}
((index++))
(
CUDA_VISIBLE_DEVICES=$gpu python src/humaneval_gen.py \
--model ${model} \
--start_index ${start_index} \
--end_index ${end_index} \
--temperature ${temp} \
--num_seqs_per_iter ${num_seqs_per_iter} \
--N ${pred_num} \
--max_len ${max_len} \
--output_path ${output_path} \
--greedy_decode
) &
if (($index % $gpu_num == 0)); then wait; fi
done
21 changes: 21 additions & 0 deletions WizardCoder/humaneval_4_a100.sh
@@ -0,0 +1,21 @@
#########################################################################
# File Name: humaneval_4_a100.sh
# Author: Xianchao Wu
# mail: [email protected]
# Created Time: Thu Aug 17 23:05:32 2023
#########################################################################
#!/bin/bash

temp=0.0
pred_num=1

#output_path=preds/humaneval_T${temp}_N${pred_num}

output_path=preds/T${temp}_N${pred_num}_WizardCoder_Greedy_Decode

echo 'Output path: '$output_path
python src/process_humaneval.py --path ${output_path} \
--out_path ${output_path}.jsonl \
--add_prompt

evaluate_functional_correctness ${output_path}.jsonl
6 changes: 5 additions & 1 deletion WizardCoder/src/process_humaneval.py
@@ -1,3 +1,7 @@
import sys

sys.path.append('/workspace/asr/WizardLM/WizardCoder/human-eval')

from human_eval.data import read_problems, write_jsonl, stream_jsonl
import glob
from tqdm import tqdm
@@ -66,4 +70,4 @@

print("save to {}".format(args.out_path))
write_jsonl(args.out_path, output)
print(a)
print(a)
79 changes: 43 additions & 36 deletions WizardCoder/src/train_wizardcoder.py
@@ -26,6 +26,7 @@
from datasets import load_dataset
import utils

import ipdb; ipdb.set_trace()
IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "<|endoftext|>"
@@ -43,7 +44,7 @@
"### Instruction:\n{instruction}\n\n### Response:"
),
}

# {'prompt_input': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:', 'prompt_no_input': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:'} NOTE

@dataclass
class ModelArguments:
@@ -96,9 +97,9 @@ def smart_tokenizer_and_embedding_resize(
input_embeddings[-num_new_tokens:] = input_embeddings_avg
output_embeddings[-num_new_tokens:] = output_embeddings_avg


def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
"""Tokenize a list of strings."""
#import ipdb; ipdb.set_trace()
tokenized_list = [
tokenizer(
text,
@@ -112,12 +113,12 @@ def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedToken
input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
input_ids_lens = labels_lens = [
tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
]
] # tokenizer.pad_token_id = 49152
return dict(
input_ids=input_ids,
labels=labels,
input_ids_lens=input_ids_lens,
labels_lens=labels_lens,
input_ids=input_ids, # [tensor([27400, 438, 600, 12404, 688, 18872, 312, 2899, 32, 5950, ... ])]
labels=labels, # [tensor([27400, 438, 600, 12404, 688, 18872, 312, 2899, 32, 5950, ...])],
input_ids_lens=input_ids_lens, # [68]
labels_lens=labels_lens, # [68] NOTE
)


@@ -127,15 +128,16 @@ def preprocess(
tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
"""Preprocess the data by tokenizing."""
examples = [s + t for s, t in zip(sources, targets)]
#import ipdb; ipdb.set_trace()
examples = [s + t for s, t in zip(sources, targets)] # ['Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate an array of length 5 which contains all even numbers between 1 and 10.\n\n### Response:arr = [2, 4, 6, 8, 10]<|endoftext|>']
examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer) for strings in (examples, sources)]
input_ids = examples_tokenized["input_ids"]
labels = copy.deepcopy(input_ids)
for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
label[:source_len] = IGNORE_INDEX
return dict(input_ids=input_ids, labels=labels)


label[:source_len] = IGNORE_INDEX # label[:49] = -100; the first 49 tokens are the pre-given prompt (condition) and do not take part in the label (target). NOTE: this modifies label in place, so labels is updated along with it; this is the crucial step.
return dict(input_ids=input_ids, labels=labels)
# input_ids = [tensor([27400, 438, 600, 12404, 688, 18872, 312, 2899, 32, 5950, ... ])]
# labels = [tensor([-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, ... -100, 846, 280, 428, 36, 30, 225, 38, 30, 225, 40, 30, 225, 42, 30, 225, 35, 34, 79, 0])]; 49 entries of -100 in total, the rest are the real labels, 68 - 49 = 19 tokens.
@dataclass
class DataCollatorForSupervisedDataset(object):
"""Collate examples for supervised fine-tuning."""
@@ -156,70 +158,75 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
)

def train_tokenize_function(examples, tokenizer):
def train_tokenize_function(examples, tokenizer): # examples = {'input': [''], 'output': ['arr = [2, 4, 6, 8, 10]'], 'instruction': ['Create an array of length 5 which contains all even numbers between 1 and 10.']};
#import ipdb; ipdb.set_trace()
prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
if 'input' in examples:
sources = [
prompt_input.format_map(dict(instruction=instruction, input=input)) if input != "" \
else prompt_no_input.format_map(dict(instruction=instruction)) \
for instruction, input in zip(examples['instruction'], examples['input'])
]
] # sources = ['Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate an array of length 5 which contains all even numbers between 1 and 10.\n\n### Response:']
else:
sources = [
prompt_no_input.format_map(dict(instruction=instruction)) \
for instruction in examples['instruction']
]
targets = [f"{output}{tokenizer.eos_token}" for output in examples['output']]
]
targets = [f"{output}{tokenizer.eos_token}" for output in examples['output']] # ['arr = [2, 4, 6, 8, 10]<|endoftext|>']
data_dict = preprocess(sources, targets, tokenizer)
return data_dict

return data_dict # {'input_ids': [tensor], 'labels': [tensor]}

def train():
import ipdb; ipdb.set_trace()
parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()

#cache_dir="/workspace/asr/WizardLM/WizardCoder"
import ipdb; ipdb.set_trace()
model = transformers.AutoModelForCausalLM.from_pretrained(
model_args.model_name_or_path,
cache_dir=training_args.cache_dir,
use_auth_token=True,
)

import ipdb; ipdb.set_trace()
tokenizer = transformers.AutoTokenizer.from_pretrained(
model_args.model_name_or_path,
cache_dir=training_args.cache_dir,
model_max_length=training_args.model_max_length,
model_max_length=training_args.model_max_length, # 2048
padding_side="right",
use_fast=True,
)
use_auth_token=True,
) # GPT2TokenizerFast(name_or_path='bigcode/starcoder', vocab_size=49152, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>', '<fim_prefix>', '<fim_middle>', '<fim_suffix>', '<fim_pad>', '<filename>', '<gh_stars>', '<issue_start>', '<issue_comment>', '<issue_closed>', '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>', '<empty_output>', '<commit_before>', '<commit_msg>', '<commit_after>', '<reponame>']}, clean_up_tokenization_spaces=True)
if tokenizer.pad_token is None:
smart_tokenizer_and_embedding_resize(
special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN), # '[PAD]' -> {'pad_token': '[PAD]'}
tokenizer=tokenizer,
model=model,
model=model, # <class 'transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM'>
)
if "starcoder" in model_args.model_name_or_path:
tokenizer.add_special_tokens(
{
"eos_token": DEFAULT_EOS_TOKEN,
"bos_token": DEFAULT_BOS_TOKEN,
"unk_token": DEFAULT_UNK_TOKEN,
"pad_token": DEFAULT_PAD_TOKEN,
"eos_token": DEFAULT_EOS_TOKEN, # '<|endoftext|>'
"bos_token": DEFAULT_BOS_TOKEN, # '<|endoftext|>'
"unk_token": DEFAULT_UNK_TOKEN, # '<|endoftext|>'
"pad_token": DEFAULT_PAD_TOKEN, # '[PAD]'
}
)

raw_train_datasets = load_dataset('json', data_files=data_args.data_path, split="train", cache_dir=training_args.cache_dir)
if training_args.local_rank > 0:
raw_train_datasets = load_dataset('json', data_files=data_args.data_path, split="train", cache_dir=training_args.cache_dir) # '/workspace/asr/WizardLM/WizardCoder/data/code_alpaca_20k.json'; Dataset({features: ['input', 'output', 'instruction'], num_rows: 20022}) NOTE
if training_args.local_rank > 0: # = 0 NOTE
torch.distributed.barrier()

train_dataset = raw_train_datasets.map(
# <class 'datasets.arrow_dataset.Dataset'> = type(row_train_datasets)
train_dataset = raw_train_datasets.map( # NOTE this batched map is really neat!
train_tokenize_function,
batched=True,
batch_size=3000,
num_proc=32,
remove_columns=raw_train_datasets.column_names,
batch_size=1, # 3000
num_proc=1, # TODO 32
remove_columns=raw_train_datasets.column_names, # ['input', 'output', 'instruction']
load_from_cache_file=True, # not args.overwrite_cache
desc="Running tokenizer on train dataset",
fn_kwargs={"tokenizer": tokenizer}
)
import ipdb; ipdb.set_trace()

if training_args.local_rank == 0:
torch.distributed.barrier()
Expand All @@ -245,4 +252,4 @@ def train():


if __name__ == "__main__":
train()
train()
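A note on the preprocessing above: the crucial step is overwriting the prompt portion of labels with IGNORE_INDEX, so the loss is computed only on the response tokens. A minimal sketch with made-up token ids (plain Python lists instead of the tensors used in the real code):

import copy

IGNORE_INDEX = -100

# Hypothetical example: 4 prompt tokens followed by 3 response tokens.
input_ids = [11, 12, 13, 14, 21, 22, 23]
source_len = 4                      # length of the tokenized prompt (49 in the trace above)
labels = copy.deepcopy(input_ids)
labels[:source_len] = [IGNORE_INDEX] * source_len
print(labels)                       # [-100, -100, -100, -100, 21, 22, 23]
# With 68 total tokens and source_len == 49, 68 - 49 = 19 tokens remain supervised.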
38 changes: 38 additions & 0 deletions WizardCoder/trainwcoder.sh
@@ -0,0 +1,38 @@
#########################################################################
# File Name: trainwcoder.sh
# Author: Xianchao Wu
# mail: [email protected]
# Created Time: Fri Aug 18 08:03:48 2023
#########################################################################
#!/bin/bash

data="/workspace/asr/WizardLM/WizardCoder/data/code_alpaca_20k.json"
#outdir="/workspace/asr/Llama-X/src/checkpoints_wcode"
outdir="/workspace/asr/WizardLM/WizardCoder/ckpts"

#deepspeed src/train_wizardcoder.py \
python -m ipdb src/train_wizardcoder.py \
--model_name_or_path "bigcode/starcoder" \
--data_path $data \
--output_dir $outdir \
--cache_dir "/workspace/asr/WizardLM/WizardCoder" \
--num_train_epochs 3 \
--model_max_length 2048 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 50 \
--save_total_limit 2 \
--learning_rate 2e-5 \
--warmup_steps 30 \
--logging_steps 2 \
--lr_scheduler_type "cosine" \
--report_to "tensorboard" \
--gradient_checkpointing True \
--deepspeed configs/deepspeed_config.json \
--fp16 True
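
A note on how these flags are consumed: train_wizardcoder.py parses them into the three dataclasses via transformers.HfArgumentParser, so e.g. --learning_rate 2e-5 lands on TrainingArguments. A minimal, illustrative sketch (argument fields abbreviated; not the full set defined in the training script):

from dataclasses import dataclass, field
from typing import Optional
import transformers

@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(default="bigcode/starcoder")

@dataclass
class DataArguments:
    data_path: Optional[str] = field(default=None)

# transformers.TrainingArguments already defines learning_rate, num_train_epochs, fp16, ...
parser = transformers.HfArgumentParser(
    (ModelArguments, DataArguments, transformers.TrainingArguments)
)
# Run with at least --output_dir (trainwcoder.sh passes it along with the other flags).
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
print(training_args.learning_rate)  # 2e-5 when launched with --learning_rate 2e-5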
