diff --git a/WizardCoder/src/download_starcoder.py b/WizardCoder/src/download_starcoder.py
new file mode 100644
index 0000000..b09c052
--- /dev/null
+++ b/WizardCoder/src/download_starcoder.py
@@ -0,0 +1,15 @@
+model="bigcode/starcoder"
+
+cache_dir="/workspace/asr/WizardLM/WizardCoder"
+
+import transformers
+
+model = transformers.AutoModelForCausalLM.from_pretrained(
+    model,
+    cache_dir=cache_dir,
+    use_auth_token=True
+)
+
+print(model)
+
+print(sum(p.numel() for p in model.parameters()))
diff --git a/WizardCoder/src/train_wizardcoder.py b/WizardCoder/src/train_wizardcoder.py
index 04bbdc2..fd8bc0d 100644
--- a/WizardCoder/src/train_wizardcoder.py
+++ b/WizardCoder/src/train_wizardcoder.py
@@ -236,7 +236,7 @@ def train():
         for index in random.sample(range(len(train_dataset)), 3):
             print(f"Sample {index} of the training set: {train_dataset[index]}.")
 
-    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
+    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) # DataCollatorForSupervisedDataset(tokenizer=GPT2TokenizerFast(name_or_path='bigcode/starcoder', vocab_size=49152, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '[PAD]', 'additional_special_tokens': ['<|endoftext|>', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']}, clean_up_tokenization_spaces=True))
     data_module = dict(train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator)
 
     #Tell Trainer not to attempt DataParallel
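
The long inline comment added to the `data_collator` line records the tokenizer configuration observed at that point, including a `[PAD]` token that `bigcode/starcoder` does not ship with. As a rough sketch (not part of the diff) of how a tokenizer could end up in that state, assuming standard `transformers` APIs and illustrative parameter values rather than the exact setup in `train_wizardcoder.py`:

```python
# Sketch only: reproduces a tokenizer roughly matching the repr quoted in the
# diff comment above. The model name, max length, and padding side are taken
# from that repr; the pad-token handling here is an assumption.
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained(
    "bigcode/starcoder",
    model_max_length=2048,
    padding_side="right",
    use_auth_token=True,
)

# StarCoder has no dedicated pad token by default; adding one would explain
# the pad_token='[PAD]' entry shown in the quoted tokenizer repr.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
```

If a pad token is added this way, the model's embedding matrix would also need to be resized (e.g. `model.resize_token_embeddings(len(tokenizer))`) before training.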