From a8efc514945de1ccae46321846ccb21c0630da57 Mon Sep 17 00:00:00 2001
From: itayhubara
Date: Thu, 29 Feb 2024 12:54:57 +0200
Subject: [PATCH] editing logging to resolve all checker issues

---
 llama2_70b_lora/run_llama_70B_scrolls_r16.sh |  9 +++------
 .../scripts/mlperf_logging_utils.py          | 25 ++++++++++++++++++++-------
 llama2_70b_lora/scripts/train.py             |  6 +++---
 3 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/llama2_70b_lora/run_llama_70B_scrolls_r16.sh b/llama2_70b_lora/run_llama_70B_scrolls_r16.sh
index 759b9aeb4..2b5377b91 100644
--- a/llama2_70b_lora/run_llama_70B_scrolls_r16.sh
+++ b/llama2_70b_lora/run_llama_70B_scrolls_r16.sh
@@ -1,16 +1,13 @@
 accelerate launch --config_file configs/default_config.yaml scripts/train.py \
---model_name meta-llama/Llama-2-70b-hf \
 --dataset_path "./dataset" \
---model_path "./llama-v2-fused-qkv" \
+--model_path "/software/users/ihubara/lora_clean/llama-v2-fused-qkv" \
 --max_seq_len 8192 \
 --bf16 True \
---logging_steps 2 \
---eval_steps 6 \
---save_steps 999 \
+--logging_steps 32 \
+--eval_steps 64 \
 --output_dir "./results/llama-70b_scrolls_gov_report_r16_$1" \
 --per_device_train_batch_size 1 \
 --gradient_accumulation_steps 1 \
---dataset_text_field "input" \
 --lr_scheduler_type "cosine" \
 --learning_rate 5e-4 \
 --warmup_ratio 0 \
diff --git a/llama2_70b_lora/scripts/mlperf_logging_utils.py b/llama2_70b_lora/scripts/mlperf_logging_utils.py
index c11c0ce2c..cda18df41 100644
--- a/llama2_70b_lora/scripts/mlperf_logging_utils.py
+++ b/llama2_70b_lora/scripts/mlperf_logging_utils.py
@@ -73,22 +73,27 @@ def end(self, key, value=None, metadata=None, sync=False, log_rank=None):
 class MLPerfCallback(TrainerCallback):
     "A callback that prints a message at the beginning of training"
 
-    def __init__(self, logger, train_dataset_length, eval_dataset_length):
+    def __init__(self, logger, train_dataset_length, eval_dataset_length, lora_alpha):
         super().__init__()
         self.mllogger = logger
         self.submission_info = {
             "submission_benchmark": "llama2_70b_lora",
-            "submission_division": "Closed",
+            "submission_division": "closed",
             "submission_org": "referece",
             "submission_platform": "referece",
             "submission_poc_name": "referece",
             "submission_poc_email": "referece",
-            "submission_status": "referece",
+            "submission_status": "onprem",
             "train_dataset_length": train_dataset_length,
             "eval_dataset_length": eval_dataset_length,
+            "lora_alpha": lora_alpha,
         }
 
     def on_train_begin(self, args, state, control, **kwargs):
+        self.gbs = args.per_device_train_batch_size * args.gradient_accumulation_steps * int(os.getenv("WORLD_SIZE", "1"))
+        self.mllogger.event(
+            key=constants.CACHE_CLEAR, value="True",
+        )
         self.mllogger.event(
             key=constants.SUBMISSION_BENCHMARK,
             value=self.submission_info["submission_benchmark"],
@@ -133,9 +138,15 @@ def on_train_begin(self, args, state, control, **kwargs):
         self.mllogger.event(key=constants.SEED, value=args.seed)
         self.mllogger.event(key=constants.OPT_LR_WARMUP_FACTOR, value=args.warmup_ratio)
         self.mllogger.event(key=constants.OPT_LR_TRAINING_STEPS, value=args.max_steps)
+        self.mllogger.event(key=constants.OPT_ADAMW_WEIGHT_DECAY, value=args.weight_decay)
+        self.mllogger.event(key=constants.OPT_GRADIENT_CLIP_NORM, value=args.max_grad_norm)
         self.mllogger.event(key=constants.OPT_BASE_LR, value=args.learning_rate)
-        self.mllogger.event(key=constants.LORA_ALPHA, value=args.lora_alpha)
+        self.mllogger.event(key=constants.LORA_ALPHA, value=self.submission_info["lora_alpha"])
+        self.mllogger.event(key="lora_rank", value=16)
         self.mllogger.event(key=constants.GRADIENT_ACCUMULATION_STEPS, value=args.gradient_accumulation_steps)
+        self.mllogger.start(key=constants.INIT_START, value="")
+        # device warmup should be done here
+        self.mllogger.end(key=constants.INIT_STOP, value="")
         self.mllogger.start(constants.RUN_START, value="")
 
     def on_step_begin(
@@ -168,9 +179,9 @@ def on_step_begin(
             metadata={"step_num": state.log_history[-1]["step"]},
         )
         self.mllogger.event(
-            "eval_loss",
+            constants.EVAL_ACCURACY,
             value=state.log_history[-1]["eval_loss"],
-            metadata={"step_num": state.log_history[-1]["step"]},
+            metadata={"samples_num": state.log_history[-1]["step"] * self.gbs},
         )
         self.mllogger.start(
             constants.BLOCK_START,
@@ -187,7 +198,7 @@ def on_step_begin(
             constants.RUN_STOP,
             value=eval_loss_list[-1],
             metadata={
-                "step_num": state.log_history[-1]["step"],
+                "samples_num": state.log_history[-1]["step"] * self.gbs,
                 "status": "success",
             },
         )
diff --git a/llama2_70b_lora/scripts/train.py b/llama2_70b_lora/scripts/train.py
index 01aa346c1..afe09912e 100644
--- a/llama2_70b_lora/scripts/train.py
+++ b/llama2_70b_lora/scripts/train.py
@@ -38,8 +38,8 @@ class ScriptArguments:
     max_grad_norm: Optional[float] = field(default=0.0)
     weight_decay: Optional[float] = field(default=0.001)
     lora_alpha: Optional[int] = field(default=32)
-    lora_dropout: Optional[float] = field(default=0.1, metadata={"lora dropout is a fixed to 0.1 in closed submission"})
-    lora_r: Optional[int] = field(default=16, metadata={"lora rank is a fixed to 16 in closed submission"})
+    lora_dropout: Optional[float] = field(default=0.1, metadata={"help": "lora dropout is fixed to 0.1 in the closed submission"})
+    lora_r: Optional[int] = field(default=16, metadata={"help": "lora rank is fixed to 16 in the closed submission"})
     lora_target_modules: Optional[str] = field(
         default=None,
         metadata={
@@ -185,7 +185,7 @@ def main(args):
         args=training_arguments,
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
-        callbacks=[MLPerfCallback(loralogger, len(train_dataset), len(eval_dataset))],
+        callbacks=[MLPerfCallback(loralogger, len(train_dataset), len(eval_dataset), args.lora_alpha)],
     )
     trainer.accelerator.print(f"{trainer.model}")
     if args.use_peft_lora:
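
A note on the bookkeeping the patch introduces: `samples_num` at EVAL_ACCURACY and RUN_STOP is derived from `self.gbs`, the global batch size computed in `on_train_begin`. `WORLD_SIZE` arrives as an environment-variable string, so it must be cast to int before the multiplication (the patch also assumes `os` is already imported in mlperf_logging_utils.py). A minimal standalone sketch of the computation, with illustrative values for the batch-size fields:

    import os

    # Global batch size: per-device batch size times gradient-accumulation
    # steps times the number of workers. WORLD_SIZE is set by the launcher
    # (e.g. accelerate or torchrun); the default "1" covers single-process runs.
    per_device_train_batch_size = 1  # illustrative; matches the run script
    gradient_accumulation_steps = 1  # illustrative; matches the run script
    gbs = (
        per_device_train_batch_size
        * gradient_accumulation_steps
        * int(os.getenv("WORLD_SIZE", "1"))
    )

    # samples_num in the mllog metadata is the optimizer step count scaled
    # by the global batch size.
    step = 64  # illustrative value read from state.log_history[-1]["step"]
    samples_num = step * gbs
    print(samples_num)

Once training has produced a result log, it can be checked with the MLPerf logging compliance checker, e.g. `python3 -m mlperf_logging.compliance_checker <result_log>` (invocation from memory; flags may vary by package version).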