Commit

debug train wizardcoder

Xianchao-Wu committed Aug 19, 2023
1 parent 0475da2 commit 19000ab

Showing 7 changed files with 217 additions and 37 deletions.
45 changes: 45 additions & 0 deletions WizardCoder/humaneval_1_a100.sh
@@ -0,0 +1,45 @@
#########################################################################
# File Name: humaneval_1_a100.sh
# Author: Xianchao Wu
# mail: [email protected]
# Created Time: Wed Aug 16 08:45:55 2023
#########################################################################
#!/bin/bash

#model="/path/to/your/model"
#model="/workspace/asr/WizardLM/WizardCoder/models--WizardLM--WizardCoder-15B-V1.0/snapshots/926ca1b215c4631bc5f8c3e47173381452c23e5c"

model="/workspace/asr/Llama-X/src/checkpoints_wcode/models--WizardLM--WizardCoder-15B-V1.0/snapshots/69e87732535159460155972c3fac32a6241cc0ca"
temp=0.2
max_len=2048
pred_num=20 #200
num_seqs_per_iter=2 #2

output_path=preds/humaneval_T${temp}_N${pred_num}_S${num_seqs_per_iter}

mkdir -p ${output_path}
echo 'Output path: '$output_path
echo 'Model to eval: '$model

# 164 problems, 21 per GPU if GPU=8
index=0
gpu_num=8
for ((i = 0; i < $gpu_num; i++)); do
start_index=$((i * 21))
end_index=$(((i + 1) * 21))

gpu=$((i))
echo 'Running process #' ${i} 'from' $start_index 'to' $end_index 'on GPU' ${gpu}
((index++))
(
CUDA_VISIBLE_DEVICES=$gpu python src/humaneval_gen.py --model ${model} \
--start_index ${start_index} \
--end_index ${end_index} \
--temperature ${temp} \
--num_seqs_per_iter ${num_seqs_per_iter} \
--N ${pred_num} \
--max_len ${max_len} \
--output_path ${output_path}
) &
if (($index % $gpu_num == 0)); then wait; fi
done
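A note on the shard arithmetic above: 8 GPUs * 21 problems = 168, which overshoots the 164 HumanEval tasks by 4, so the last shard is assumed to be clipped inside humaneval_gen.py (not shown in this diff). A minimal Python sketch of the same split with an explicit clip:

# Sketch only: mirrors the shard arithmetic in humaneval_1_a100.sh, with an explicit
# clip at 164 problems (assumed behaviour; the script relies on the generator to stop at the end).
NUM_PROBLEMS = 164  # HumanEval task count
GPU_NUM = 8
PER_GPU = 21        # ceil(164 / 8)

for gpu in range(GPU_NUM):
    start_index = gpu * PER_GPU
    end_index = min((gpu + 1) * PER_GPU, NUM_PROBLEMS)
    print(f"GPU {gpu}: problems [{start_index}, {end_index})")
# The last shard is [147, 164): 17 problems instead of 21.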
19 changes: 19 additions & 0 deletions WizardCoder/humaneval_2_a100.sh
@@ -0,0 +1,19 @@
#########################################################################
# File Name: humaneval_2_a100.sh
# Author: Xianchao Wu
# mail: [email protected]
# Created Time: Thu Aug 17 23:05:32 2023
#########################################################################
#!/bin/bash

temp=0.2
pred_num=1

output_path=preds/humaneval_T${temp}_N${pred_num}

echo 'Output path: '$output_path
python src/process_humaneval.py --path ${output_path} \
--out_path ${output_path}.jsonl \
--add_prompt

evaluate_functional_correctness ${output_path}.jsonl
46 changes: 46 additions & 0 deletions WizardCoder/humaneval_3_a100.sh
@@ -0,0 +1,46 @@
#########################################################################
# File Name: humaneval_3_a100.sh
# Author: Xianchao Wu
# mail: [email protected]
# Created Time: Thu Aug 17 23:20:01 2023
#########################################################################
#!/bin/bash

#model="WizardLM/WizardCoder-15B-V1.0"
model="/workspace/asr/Llama-X/src/checkpoints_wcode/models--WizardLM--WizardCoder-15B-V1.0/snapshots/69e87732535159460155972c3fac32a6241cc0ca"

temp=0.0
max_len=2048
pred_num=1
num_seqs_per_iter=1

output_path=preds/T${temp}_N${pred_num}_WizardCoder_Greedy_Decode

mkdir -p ${output_path}
echo 'Output path: '$output_path
echo 'Model to eval: '$model

# 164 problems, 21 per GPU if GPU=8
index=0
gpu_num=8
for ((i = 0; i < $gpu_num; i++)); do
start_index=$((i * 21))
end_index=$(((i + 1) * 21))

gpu=$((i))
echo 'Running process #' ${i} 'from' $start_index 'to' $end_index 'on GPU' ${gpu}
((index++))
(
CUDA_VISIBLE_DEVICES=$gpu python src/humaneval_gen.py \
--model ${model} \
--start_index ${start_index} \
--end_index ${end_index} \
--temperature ${temp} \
--num_seqs_per_iter ${num_seqs_per_iter} \
--N ${pred_num} \
--max_len ${max_len} \
--output_path ${output_path} \
--greedy_decode
) &
if (($index % $gpu_num == 0)); then wait; fi
done
21 changes: 21 additions & 0 deletions WizardCoder/humaneval_4_a100.sh
@@ -0,0 +1,21 @@
#########################################################################
# File Name: humaneval_4_a100.sh
# Author: Xianchao Wu
# mail: [email protected]
# Created Time: Thu Aug 17 23:05:32 2023
#########################################################################
#!/bin/bash

temp=0.0
pred_num=1

#output_path=preds/humaneval_T${temp}_N${pred_num}

output_path=preds/T${temp}_N${pred_num}_WizardCoder_Greedy_Decode

echo 'Output path: '$output_path
python src/process_humaneval.py --path ${output_path} \
--out_path ${output_path}.jsonl \
--add_prompt

evaluate_functional_correctness ${output_path}.jsonl
6 changes: 5 additions & 1 deletion WizardCoder/src/process_humaneval.py
@@ -1,3 +1,7 @@
import sys

sys.path.append('/workspace/asr/WizardLM/WizardCoder/human-eval')

from human_eval.data import read_problems, write_jsonl, stream_jsonl
import glob
from tqdm import tqdm
@@ -66,4 +70,4 @@

print("save to {}".format(args.out_path))
write_jsonl(args.out_path, output)
print(a)
print(a)
79 changes: 43 additions & 36 deletions WizardCoder/src/train_wizardcoder.py
@@ -26,6 +26,7 @@
from datasets import load_dataset
import utils

import ipdb; ipdb.set_trace()
IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "<|endoftext|>"
@@ -43,7 +44,7 @@
"### Instruction:\n{instruction}\n\n### Response:"
),
}

# {'prompt_input': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:', 'prompt_no_input': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:'} NOTE

@dataclass
class ModelArguments:
@@ -96,9 +97,9 @@ def smart_tokenizer_and_embedding_resize(
input_embeddings[-num_new_tokens:] = input_embeddings_avg
output_embeddings[-num_new_tokens:] = output_embeddings_avg


def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
"""Tokenize a list of strings."""
#import ipdb; ipdb.set_trace()
tokenized_list = [
tokenizer(
text,
@@ -112,12 +113,12 @@ def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedToken
input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
input_ids_lens = labels_lens = [
tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
]
] # tokenizer.pad_token_id = 49152
return dict(
input_ids=input_ids,
labels=labels,
input_ids_lens=input_ids_lens,
labels_lens=labels_lens,
input_ids=input_ids, # [tensor([27400, 438, 600, 12404, 688, 18872, 312, 2899, 32, 5950, ... ])]
labels=labels, # [tensor([27400, 438, 600, 12404, 688, 18872, 312, 2899, 32, 5950, ...])],
input_ids_lens=input_ids_lens, # [68]
labels_lens=labels_lens, # [68] NOTE
)


@@ -127,15 +128,16 @@ def preprocess(
tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
"""Preprocess the data by tokenizing."""
examples = [s + t for s, t in zip(sources, targets)]
#import ipdb; ipdb.set_trace()
examples = [s + t for s, t in zip(sources, targets)] # ['Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate an array of length 5 which contains all even numbers between 1 and 10.\n\n### Response:arr = [2, 4, 6, 8, 10]<|endoftext|>']
examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer) for strings in (examples, sources)]
input_ids = examples_tokenized["input_ids"]
labels = copy.deepcopy(input_ids)
for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
label[:source_len] = IGNORE_INDEX
return dict(input_ids=input_ids, labels=labels)


label[:source_len] = IGNORE_INDEX # label[:49] = -100; the first 49 tokens are the pre-given prompt (condition) and do not take part in the label (target). NOTE: this modifies label in place, so labels is updated along with it; this is the crucial step.
return dict(input_ids=input_ids, labels=labels)
# input_ids = [tensor([27400, 438, 600, 12404, 688, 18872, 312, 2899, 32, 5950, ... ])]
# labels = [tensor([-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, ... -100, 846, 280, 428, 36, 30, 225, 38, 30, 225, 40, 30, 225, 42, 30, 225, 35, 34, 79, 0])]; 49 entries of -100 in total, the rest are the real labels, 68 - 49 = 19 tokens.
@dataclass
class DataCollatorForSupervisedDataset(object):
"""Collate examples for supervised fine-tuning."""
@@ -156,70 +158,75 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
)

def train_tokenize_function(examples, tokenizer):
def train_tokenize_function(examples, tokenizer): # examples = {'input': [''], 'output': ['arr = [2, 4, 6, 8, 10]'], 'instruction': ['Create an array of length 5 which contains all even numbers between 1 and 10.']};
#import ipdb; ipdb.set_trace()
prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
if 'input' in examples:
sources = [
prompt_input.format_map(dict(instruction=instruction, input=input)) if input != "" \
else prompt_no_input.format_map(dict(instruction=instruction)) \
for instruction, input in zip(examples['instruction'], examples['input'])
]
] # sources = ['Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate an array of length 5 which contains all even numbers between 1 and 10.\n\n### Response:']
else:
sources = [
prompt_no_input.format_map(dict(instruction=instruction)) \
for instruction in examples['instruction']
]
targets = [f"{output}{tokenizer.eos_token}" for output in examples['output']]
]
targets = [f"{output}{tokenizer.eos_token}" for output in examples['output']] # ['arr = [2, 4, 6, 8, 10]<|endoftext|>']
data_dict = preprocess(sources, targets, tokenizer)
return data_dict

return data_dict # {'input_ids': [tensor], 'labels': [tensor]}

def train():
import ipdb; ipdb.set_trace()
parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()

#cache_dir="/workspace/asr/WizardLM/WizardCoder"
import ipdb; ipdb.set_trace()
model = transformers.AutoModelForCausalLM.from_pretrained(
model_args.model_name_or_path,
cache_dir=training_args.cache_dir,
use_auth_token=True,
)

import ipdb; ipdb.set_trace()
tokenizer = transformers.AutoTokenizer.from_pretrained(
model_args.model_name_or_path,
cache_dir=training_args.cache_dir,
model_max_length=training_args.model_max_length,
model_max_length=training_args.model_max_length, # 2048
padding_side="right",
use_fast=True,
)
use_auth_token=True,
) # GPT2TokenizerFast(name_or_path='bigcode/starcoder', vocab_size=49152, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>', '<fim_prefix>', '<fim_middle>', '<fim_suffix>', '<fim_pad>', '<filename>', '<gh_stars>', '<issue_start>', '<issue_comment>', '<issue_closed>', '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>', '<empty_output>', '<commit_before>', '<commit_msg>', '<commit_after>', '<reponame>']}, clean_up_tokenization_spaces=True)
if tokenizer.pad_token is None:
smart_tokenizer_and_embedding_resize(
special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN), # '[PAD]' -> {'pad_token': '[PAD]'}
tokenizer=tokenizer,
model=model,
model=model, # <class 'transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM'>
)
if "starcoder" in model_args.model_name_or_path:
tokenizer.add_special_tokens(
{
"eos_token": DEFAULT_EOS_TOKEN,
"bos_token": DEFAULT_BOS_TOKEN,
"unk_token": DEFAULT_UNK_TOKEN,
"pad_token": DEFAULT_PAD_TOKEN,
"eos_token": DEFAULT_EOS_TOKEN, # '<|endoftext|>'
"bos_token": DEFAULT_BOS_TOKEN, # '<|endoftext|>'
"unk_token": DEFAULT_UNK_TOKEN, # '<|endoftext|>'
"pad_token": DEFAULT_PAD_TOKEN, # '[PAD]'
}
)

raw_train_datasets = load_dataset('json', data_files=data_args.data_path, split="train", cache_dir=training_args.cache_dir)
if training_args.local_rank > 0:
raw_train_datasets = load_dataset('json', data_files=data_args.data_path, split="train", cache_dir=training_args.cache_dir) # '/workspace/asr/WizardLM/WizardCoder/data/code_alpaca_20k.json'; Dataset({features: ['input', 'output', 'instruction'], num_rows: 20022}) NOTE
if training_args.local_rank > 0: # = 0 NOTE
torch.distributed.barrier()

train_dataset = raw_train_datasets.map(
# <class 'datasets.arrow_dataset.Dataset'> = type(row_train_datasets)
train_dataset = raw_train_datasets.map( # NOTE this batched map is really neat!
train_tokenize_function,
batched=True,
batch_size=3000,
num_proc=32,
remove_columns=raw_train_datasets.column_names,
batch_size=1, # 3000
num_proc=1, # TODO 32
remove_columns=raw_train_datasets.column_names, # ['input', 'output', 'instruction']
load_from_cache_file=True, # not args.overwrite_cache
desc="Running tokenizer on train dataset",
fn_kwargs={"tokenizer": tokenizer}
)
import ipdb; ipdb.set_trace()

if training_args.local_rank == 0:
torch.distributed.barrier()
Expand All @@ -245,4 +252,4 @@ def train():


if __name__ == "__main__":
train()
train()
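A note on the preprocessing above: the crucial step is overwriting the prompt portion of labels with IGNORE_INDEX, so the loss is computed only on the response tokens. A minimal sketch with made-up token ids (plain Python lists instead of the tensors used in the real code):

import copy

IGNORE_INDEX = -100

# Hypothetical example: 4 prompt tokens followed by 3 response tokens.
input_ids = [11, 12, 13, 14, 21, 22, 23]
source_len = 4                      # length of the tokenized prompt (49 in the trace above)
labels = copy.deepcopy(input_ids)
labels[:source_len] = [IGNORE_INDEX] * source_len
print(labels)                       # [-100, -100, -100, -100, 21, 22, 23]
# With 68 total tokens and source_len == 49, 68 - 49 = 19 tokens remain supervised.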
38 changes: 38 additions & 0 deletions WizardCoder/trainwcoder.sh
@@ -0,0 +1,38 @@
#########################################################################
# File Name: trainwcoder.sh
# Author: Xianchao Wu
# mail: [email protected]
# Created Time: Fri Aug 18 08:03:48 2023
#########################################################################
#!/bin/bash

data="/workspace/asr/WizardLM/WizardCoder/data/code_alpaca_20k.json"
#outdir="/workspace/asr/Llama-X/src/checkpoints_wcode"
outdir="/workspace/asr/WizardLM/WizardCoder/ckpts"

#deepspeed src/train_wizardcoder.py \
python -m ipdb src/train_wizardcoder.py \
--model_name_or_path "bigcode/starcoder" \
--data_path $data \
--output_dir $outdir \
--cache_dir "/workspace/asr/WizardLM/WizardCoder" \
--num_train_epochs 3 \
--model_max_length 2048 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 50 \
--save_total_limit 2 \
--learning_rate 2e-5 \
--warmup_steps 30 \
--logging_steps 2 \
--lr_scheduler_type "cosine" \
--report_to "tensorboard" \
--gradient_checkpointing True \
--deepspeed configs/deepspeed_config.json \
--fp16 True
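
A note on how these flags are consumed: train_wizardcoder.py parses them into the three dataclasses via transformers.HfArgumentParser, so e.g. --learning_rate 2e-5 lands on TrainingArguments. A minimal, illustrative sketch (argument fields abbreviated; not the full set defined in the training script):

from dataclasses import dataclass, field
from typing import Optional
import transformers

@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(default="bigcode/starcoder")

@dataclass
class DataArguments:
    data_path: Optional[str] = field(default=None)

# transformers.TrainingArguments already defines learning_rate, num_train_epochs, fp16, ...
parser = transformers.HfArgumentParser(
    (ModelArguments, DataArguments, transformers.TrainingArguments)
)
# Run with at least --output_dir (trainwcoder.sh passes it along with the other flags).
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
print(training_args.learning_rate)  # 2e-5 when launched with --learning_rate 2e-5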
