diff --git a/WizardCoder/humaneval_1_a100.sh b/WizardCoder/humaneval_1_a100.sh
new file mode 100644
index 0000000..8310edc
--- /dev/null
+++ b/WizardCoder/humaneval_1_a100.sh
@@ -0,0 +1,45 @@
+#########################################################################
+# File Name: humaneval_1_a100.sh
+# Author: Xianchao Wu
+# mail: xianchaow@nvidia.com
+# Created Time: Wed Aug 16 08:45:55 2023
+#########################################################################
+#!/bin/bash
+
+#model="/path/to/your/model"
+#model="/workspace/asr/WizardLM/WizardCoder/models--WizardLM--WizardCoder-15B-V1.0/snapshots/926ca1b215c4631bc5f8c3e47173381452c23e5c"
+
+model="/workspace/asr/Llama-X/src/checkpoints_wcode/models--WizardLM--WizardCoder-15B-V1.0/snapshots/69e87732535159460155972c3fac32a6241cc0ca"
+temp=0.2
+max_len=2048
+pred_num=20 #200
+num_seqs_per_iter=2 #2
+
+output_path=preds/humaneval_T${temp}_N${pred_num}_S${num_seqs_per_iter}
+
+mkdir -p ${output_path}
+echo 'Output path: '$output_path
+echo 'Model to eval: '$model
+
+# 164 problems, 21 per GPU if GPU=8
+index=0
+gpu_num=8
+for ((i = 0; i < $gpu_num; i++)); do
+  start_index=$((i * 21))
+  end_index=$(((i + 1) * 21))
+
+  gpu=$((i))
+  echo 'Running process #' ${i} 'from' $start_index 'to' $end_index 'on GPU' ${gpu}
+  ((index++))
+  (
+    CUDA_VISIBLE_DEVICES=$gpu python src/humaneval_gen.py --model ${model} \
+      --start_index ${start_index} \
+      --end_index ${end_index} \
+      --temperature ${temp} \
+      --num_seqs_per_iter ${num_seqs_per_iter} \
+      --N ${pred_num} \
+      --max_len ${max_len} \
+      --output_path ${output_path}
+  ) &
+  if (($index % $gpu_num == 0)); then wait; fi
+done
diff --git a/WizardCoder/humaneval_2_a100.sh b/WizardCoder/humaneval_2_a100.sh
new file mode 100644
index 0000000..695805f
--- /dev/null
+++ b/WizardCoder/humaneval_2_a100.sh
@@ -0,0 +1,19 @@
+#########################################################################
+# File Name: humaneval_2_a100.sh
+# Author: Xianchao Wu
+# mail: xianchaow@nvidia.com
+# Created Time: Thu Aug 17 23:05:32 2023
+#########################################################################
+#!/bin/bash
+
+temp=0.2
+pred_num=1
+
+output_path=preds/humaneval_T${temp}_N${pred_num}
+
+echo 'Output path: '$output_path
+python src/process_humaneval.py --path ${output_path} \
+  --out_path ${output_path}.jsonl \
+  --add_prompt
+
+evaluate_functional_correctness ${output_path}.jsonl
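humaneval_1_a100.sh (and humaneval_3_a100.sh below) shard the 164 HumanEval problems as 21 per GPU over 8 GPUs (8 * 21 = 168, so the last shard runs short). A minimal standalone sketch of the resulting index ranges, assuming humaneval_gen.py clamps end_index to the dataset size:

# Illustration only: the start/end indices the loop above passes to humaneval_gen.py.
NUM_PROBLEMS = 164  # HumanEval size
GPU_NUM = 8
PER_GPU = 21

shards = []
for i in range(GPU_NUM):
    start_index = i * PER_GPU
    end_index = min((i + 1) * PER_GPU, NUM_PROBLEMS)  # assumes the generator clamps the last shard
    shards.append((start_index, end_index))

print(shards)  # [(0, 21), (21, 42), ..., (147, 164)]
assert sum(e - s for s, e in shards) == NUM_PROBLEMS  # every problem is covered exactly once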
diff --git a/WizardCoder/humaneval_3_a100.sh b/WizardCoder/humaneval_3_a100.sh
new file mode 100644
index 0000000..fd58bdd
--- /dev/null
+++ b/WizardCoder/humaneval_3_a100.sh
@@ -0,0 +1,46 @@
+#########################################################################
+# File Name: humaneval_3_a100.sh
+# Author: Xianchao Wu
+# mail: xianchaow@nvidia.com
+# Created Time: Thu Aug 17 23:20:01 2023
+#########################################################################
+#!/bin/bash
+
+#model="WizardLM/WizardCoder-15B-V1.0"
+model="/workspace/asr/Llama-X/src/checkpoints_wcode/models--WizardLM--WizardCoder-15B-V1.0/snapshots/69e87732535159460155972c3fac32a6241cc0ca"
+
+temp=0.0
+max_len=2048
+pred_num=1
+num_seqs_per_iter=1
+
+output_path=preds/T${temp}_N${pred_num}_WizardCoder_Greedy_Decode
+
+mkdir -p ${output_path}
+echo 'Output path: '$output_path
+echo 'Model to eval: '$model
+
+# 164 problems, 21 per GPU if GPU=8
+index=0
+gpu_num=8
+for ((i = 0; i < $gpu_num; i++)); do
+  start_index=$((i * 21))
+  end_index=$(((i + 1) * 21))
+
+  gpu=$((i))
+  echo 'Running process #' ${i} 'from' $start_index 'to' $end_index 'on GPU' ${gpu}
+  ((index++))
+  (
+    CUDA_VISIBLE_DEVICES=$gpu python src/humaneval_gen.py \
+      --model ${model} \
+      --start_index ${start_index} \
+      --end_index ${end_index} \
+      --temperature ${temp} \
+      --num_seqs_per_iter ${num_seqs_per_iter} \
+      --N ${pred_num} \
+      --max_len ${max_len} \
+      --output_path ${output_path} \
+      --greedy_decode
+  ) &
+  if (($index % $gpu_num == 0)); then wait; fi
+done
diff --git a/WizardCoder/humaneval_4_a100.sh b/WizardCoder/humaneval_4_a100.sh
new file mode 100644
index 0000000..5d4e3a4
--- /dev/null
+++ b/WizardCoder/humaneval_4_a100.sh
@@ -0,0 +1,21 @@
+#########################################################################
+# File Name: humaneval_4_a100.sh
+# Author: Xianchao Wu
+# mail: xianchaow@nvidia.com
+# Created Time: Thu Aug 17 23:05:32 2023
+#########################################################################
+#!/bin/bash
+
+temp=0.0
+pred_num=1
+
+#output_path=preds/humaneval_T${temp}_N${pred_num}
+
+output_path=preds/T${temp}_N${pred_num}_WizardCoder_Greedy_Decode
+
+echo 'Output path: '$output_path
+python src/process_humaneval.py --path ${output_path} \
+  --out_path ${output_path}.jsonl \
+  --add_prompt
+
+evaluate_functional_correctness ${output_path}.jsonl
diff --git a/WizardCoder/src/process_humaneval.py b/WizardCoder/src/process_humaneval.py
index 1023a09..52f9f6d 100644
--- a/WizardCoder/src/process_humaneval.py
+++ b/WizardCoder/src/process_humaneval.py
@@ -1,3 +1,7 @@
+import sys
+
+sys.path.append('/workspace/asr/WizardLM/WizardCoder/human-eval')
+
 from human_eval.data import read_problems, write_jsonl, stream_jsonl
 import glob
 from tqdm import tqdm
@@ -66,4 +70,4 @@
 print("save to {}".format(args.out_path))
 write_jsonl(args.out_path, output)
 
-print(a)
\ No newline at end of file
+print(a)
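process_humaneval.py gathers the per-shard generations under --path into a single .jsonl that evaluate_functional_correctness (the human-eval CLI) can score; each record carries a task_id and a completion. A minimal sketch of that output format, with dummy completion bodies:

# Illustration only: the jsonl layout consumed by `evaluate_functional_correctness`.
from human_eval.data import write_jsonl

samples = [
    {"task_id": "HumanEval/0", "completion": "    return False\n"},  # dummy body, shown for the format only
    {"task_id": "HumanEval/0", "completion": "    return True\n"},   # several samples per task are allowed (pass@k)
]
write_jsonl("preds/example.jsonl", samples)
# then: evaluate_functional_correctness preds/example.jsonl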
diff --git a/WizardCoder/src/train_wizardcoder.py b/WizardCoder/src/train_wizardcoder.py
index 245a9ef..04bbdc2 100644
--- a/WizardCoder/src/train_wizardcoder.py
+++ b/WizardCoder/src/train_wizardcoder.py
@@ -26,6 +26,7 @@ from datasets import load_dataset
 
 import utils
 
+import ipdb; ipdb.set_trace()
 IGNORE_INDEX = -100
 DEFAULT_PAD_TOKEN = "[PAD]"
 DEFAULT_EOS_TOKEN = "<|endoftext|>"
@@ -43,7 +44,7 @@
         "### Instruction:\n{instruction}\n\n### Response:"
     ),
 }
-
+# {'prompt_input': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:', 'prompt_no_input': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:'} NOTE
 
 @dataclass
 class ModelArguments:
@@ -96,9 +97,9 @@ def smart_tokenizer_and_embedding_resize(
     input_embeddings[-num_new_tokens:] = input_embeddings_avg
     output_embeddings[-num_new_tokens:] = output_embeddings_avg
 
-
 def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
     """Tokenize a list of strings."""
+    #import ipdb; ipdb.set_trace()
     tokenized_list = [
         tokenizer(
             text,
@@ -112,12 +113,12 @@ def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedToken
     input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
     input_ids_lens = labels_lens = [
         tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
-    ]
+    ] # tokenizer.pad_token_id = 49152
     return dict(
-        input_ids=input_ids,
-        labels=labels,
-        input_ids_lens=input_ids_lens,
-        labels_lens=labels_lens,
+        input_ids=input_ids, # [tensor([27400, 438, 600, 12404, 688, 18872, 312, 2899, 32, 5950, ... ])]
+        labels=labels, # [tensor([27400, 438, 600, 12404, 688, 18872, 312, 2899, 32, 5950, ...])],
+        input_ids_lens=input_ids_lens, # [68]
+        labels_lens=labels_lens, # [68] NOTE
     )


@@ -127,15 +128,16 @@ def preprocess(
     tokenizer: transformers.PreTrainedTokenizer,
 ) -> Dict:
     """Preprocess the data by tokenizing."""
-    examples = [s + t for s, t in zip(sources, targets)]
+    #import ipdb; ipdb.set_trace()
+    examples = [s + t for s, t in zip(sources, targets)] # ['Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate an array of length 5 which contains all even numbers between 1 and 10.\n\n### Response:arr = [2, 4, 6, 8, 10]<|endoftext|>']
     examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer) for strings in (examples, sources)]
     input_ids = examples_tokenized["input_ids"]
     labels = copy.deepcopy(input_ids)
     for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
-        label[:source_len] = IGNORE_INDEX
-    return dict(input_ids=input_ids, labels=labels)
-
-
+        label[:source_len] = IGNORE_INDEX # label[:49] = -100; the first 49 tokens are the pre-given condition (the prompt) and do not take part in the label (target). NOTE this assigns into label in place, so the corresponding entry of labels is modified along with it. NOTE this is very important.
+    return dict(input_ids=input_ids, labels=labels) #
+    # input_ids = [tensor([27400, 438, 600, 12404, 688, 18872, 312, 2899, 32, 5950, ... ])]
+    # labels = [tensor([-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, ... -100, 846, 280, 428, 36, 30, 225, 38, 30, 225, 40, 30, 225, 42, 30, 225, 35, 34, 79, 0])], 49 leading -100 values in total; the remaining entries are the real labels, with length 68 - 49 = 19 tokens
 @dataclass
 class DataCollatorForSupervisedDataset(object):
     """Collate examples for supervised fine-tuning."""
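A minimal standalone sketch of what the preprocess hunk above does to the loss targets, using the token counts from the NOTE comments (a 68-token example whose first 49 tokens are the prompt); illustration only:

# Illustration only: prompt tokens are masked out of the loss target with -100.
import torch

IGNORE_INDEX = -100
input_ids = torch.arange(68)          # stand-in for the 68 tokenized prompt+response ids
labels = input_ids.clone()
source_len = 49                       # length of the prompt ("pre-given condition")
labels[:source_len] = IGNORE_INDEX    # CrossEntropyLoss ignores positions labelled -100
assert (labels == IGNORE_INDEX).sum().item() == 49
assert (labels != IGNORE_INDEX).sum().item() == 68 - 49  # only the 19 response tokens receive a loss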
@@ -156,70 +158,75 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
             attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
         )
 
-def train_tokenize_function(examples, tokenizer):
+def train_tokenize_function(examples, tokenizer): # examples = {'input': [''], 'output': ['arr = [2, 4, 6, 8, 10]'], 'instruction': ['Create an array of length 5 which contains all even numbers between 1 and 10.']};
+    #import ipdb; ipdb.set_trace()
     prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
     if 'input' in examples:
         sources = [
             prompt_input.format_map(dict(instruction=instruction, input=input)) if input != "" \
             else prompt_no_input.format_map(dict(instruction=instruction)) \
             for instruction, input in zip(examples['instruction'], examples['input'])
-        ]
+        ] # sources = ['Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate an array of length 5 which contains all even numbers between 1 and 10.\n\n### Response:']
     else:
         sources = [
             prompt_no_input.format_map(dict(instruction=instruction)) \
             for instruction in examples['instruction']
-        ]
-    targets = [f"{output}{tokenizer.eos_token}" for output in examples['output']]
+        ]
+    targets = [f"{output}{tokenizer.eos_token}" for output in examples['output']] # ['arr = [2, 4, 6, 8, 10]<|endoftext|>']
     data_dict = preprocess(sources, targets, tokenizer)
-    return data_dict
-
+    return data_dict # {'input_ids': [tensor], 'labels': [tensor]}
 
 def train():
+    import ipdb; ipdb.set_trace()
     parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
-
+    #cache_dir="/workspace/asr/WizardLM/WizardCoder"
+    import ipdb; ipdb.set_trace()
     model = transformers.AutoModelForCausalLM.from_pretrained(
         model_args.model_name_or_path,
         cache_dir=training_args.cache_dir,
+        use_auth_token=True,
     )
-
+    import ipdb; ipdb.set_trace()
     tokenizer = transformers.AutoTokenizer.from_pretrained(
         model_args.model_name_or_path,
         cache_dir=training_args.cache_dir,
-        model_max_length=training_args.model_max_length,
+        model_max_length=training_args.model_max_length, # 2048
         padding_side="right",
         use_fast=True,
-    )
+        use_auth_token=True,
+    ) # GPT2TokenizerFast(name_or_path='bigcode/starcoder', vocab_size=49152, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>', '<fim_prefix>', '<fim_middle>', '<fim_suffix>', '<fim_pad>', '<filename>', '<gh_stars>', '<issue_start>', '<issue_comment>', '<issue_closed>', '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>', '<empty_output>', '<commit_before>', '<commit_msg>', '<commit_after>', '<reponame>']}, clean_up_tokenization_spaces=True)
     if tokenizer.pad_token is None:
         smart_tokenizer_and_embedding_resize(
-            special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
+            special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN), # '[PAD]' -> {'pad_token': '[PAD]'}
             tokenizer=tokenizer,
-            model=model,
+            model=model, #
         )
     if "starcoder" in model_args.model_name_or_path:
         tokenizer.add_special_tokens(
             {
-                "eos_token": DEFAULT_EOS_TOKEN,
-                "bos_token": DEFAULT_BOS_TOKEN,
-                "unk_token": DEFAULT_UNK_TOKEN,
-                "pad_token": DEFAULT_PAD_TOKEN,
+                "eos_token": DEFAULT_EOS_TOKEN, # '<|endoftext|>'
+                "bos_token": DEFAULT_BOS_TOKEN, # '<|endoftext|>'
+                "unk_token": DEFAULT_UNK_TOKEN, # '<|endoftext|>'
+                "pad_token": DEFAULT_PAD_TOKEN, # '[PAD]'
             }
         )
 
-    raw_train_datasets = load_dataset('json', data_files=data_args.data_path, split="train", cache_dir=training_args.cache_dir)
-    if training_args.local_rank > 0:
+    raw_train_datasets = load_dataset('json', data_files=data_args.data_path, split="train", cache_dir=training_args.cache_dir) # '/workspace/asr/WizardLM/WizardCoder/data/code_alpaca_20k.json'; Dataset({features: ['input', 'output', 'instruction'], num_rows: 20022}) NOTE
+    if training_args.local_rank > 0: # = 0 NOTE
         torch.distributed.barrier()
-
-    train_dataset = raw_train_datasets.map(
+    # <class 'datasets.arrow_dataset.Dataset'> = type(raw_train_datasets)
+    train_dataset = raw_train_datasets.map( # NOTE this is really neat!
         train_tokenize_function,
         batched=True,
-        batch_size=3000,
-        num_proc=32,
-        remove_columns=raw_train_datasets.column_names,
+        batch_size=1, # 3000
+        num_proc=1, # TODO 32
+        remove_columns=raw_train_datasets.column_names, # ['input', 'output', 'instruction']
        load_from_cache_file=True, # not args.overwrite_cache
        desc="Running tokenizer on train dataset",
        fn_kwargs={"tokenizer": tokenizer}
     )
+    import ipdb; ipdb.set_trace()
 
     if training_args.local_rank == 0:
         torch.distributed.barrier()
@@ -245,4 +252,4 @@ def train():
 
 
 if __name__ == "__main__":
-    train()
\ No newline at end of file
+    train()
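The pad-token branch above relies on smart_tokenizer_and_embedding_resize, whose tail is visible in the @@ -96,9 hunk: it registers '[PAD]', grows the embedding matrices, and initialises the new rows with the mean of the existing ones. A minimal sketch of that behaviour, inferred from the visible fragment rather than copied from the file:

# Illustration only: what adding '[PAD]' to a tokenizer without a pad token boils down to.
def resize_for_new_tokens(model, tokenizer, special_tokens_dict):
    num_new = tokenizer.add_special_tokens(special_tokens_dict)  # e.g. {'pad_token': '[PAD]'} -> 1
    model.resize_token_embeddings(len(tokenizer))                # grow input/output embedding matrices
    if num_new > 0:
        in_emb = model.get_input_embeddings().weight.data
        out_emb = model.get_output_embeddings().weight.data
        # initialise the freshly added rows with the average of the pre-existing embeddings
        in_emb[-num_new:] = in_emb[:-num_new].mean(dim=0, keepdim=True)
        out_emb[-num_new:] = out_emb[:-num_new].mean(dim=0, keepdim=True)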
diff --git a/WizardCoder/trainwcoder.sh b/WizardCoder/trainwcoder.sh
new file mode 100644
index 0000000..bb0e832
--- /dev/null
+++ b/WizardCoder/trainwcoder.sh
@@ -0,0 +1,38 @@
+#########################################################################
+# File Name: trainwcoder.sh
+# Author: Xianchao Wu
+# mail: xianchaow@nvidia.com
+# Created Time: Fri Aug 18 08:03:48 2023
+#########################################################################
+#!/bin/bash
+
+#!/bin/bash
+
+data="/workspace/asr/WizardLM/WizardCoder/data/code_alpaca_20k.json"
+#outdir="/workspace/asr/Llama-X/src/checkpoints_wcode"
+outdir="/workspace/asr/WizardLM/WizardCoder/ckpts"
+
+#deepspeed src/train_wizardcoder.py \
+python -m ipdb src/train_wizardcoder.py \
+    --model_name_or_path "bigcode/starcoder" \
+    --data_path $data \
+    --output_dir $outdir \
+    --cache_dir "/workspace/asr/WizardLM/WizardCoder" \
+    --num_train_epochs 3 \
+    --model_max_length 2048 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 1 \
+    --evaluation_strategy "no" \
+    --save_strategy "steps" \
+    --save_steps 50 \
+    --save_total_limit 2 \
+    --learning_rate 2e-5 \
+    --warmup_steps 30 \
+    --logging_steps 2 \
+    --lr_scheduler_type "cosine" \
+    --report_to "tensorboard" \
+    --gradient_checkpointing True \
+    --deepspeed configs/deepspeed_config.json \
+    --fp16 True
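The token counts quoted in the NOTE comments (68 input ids, 49 masked prompt positions, 19 supervised response tokens) can be checked by tokenizing the same Code Alpaca example directly; a minimal sketch, assuming the bigcode/starcoder tokenizer is reachable:

# Illustration only: reproduce the prompt/response token split outside the training script.
from transformers import AutoTokenizer

PROMPT = ("Below is an instruction that describes a task. "
          "Write a response that appropriately completes the request.\n\n"
          "### Instruction:\nCreate an array of length 5 which contains all even numbers between 1 and 10.\n\n"
          "### Response:")
TARGET = "arr = [2, 4, 6, 8, 10]<|endoftext|>"

tok = AutoTokenizer.from_pretrained("bigcode/starcoder", use_fast=True)  # assumes access to the checkpoint
source_len = len(tok(PROMPT).input_ids)           # prompt tokens, masked with -100 (49 in the NOTE above)
total_len = len(tok(PROMPT + TARGET).input_ids)   # full example (68 in the NOTE above)
print(source_len, total_len, total_len - source_len)  # last value = response tokens that receive a loss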