From abfedb94cc96ac3901cb3cfaa44d0c29c30bd8b8 Mon Sep 17 00:00:00 2001 From: zhangsmallshark Date: Mon, 10 Feb 2025 10:32:32 -0600 Subject: [PATCH] save args Signed-off-by: zhangsmallshark --- training/DeepSpeed-Domino/domino/arguments.py | 2 +- training/DeepSpeed-Domino/megatron/checkpointing.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/training/DeepSpeed-Domino/domino/arguments.py b/training/DeepSpeed-Domino/domino/arguments.py index 0faf58d93..6f238fd6f 100644 --- a/training/DeepSpeed-Domino/domino/arguments.py +++ b/training/DeepSpeed-Domino/domino/arguments.py @@ -218,7 +218,7 @@ def parse_args(): help='Do not load optimizer when loading checkpoint.') parser.add_argument('--no-load-rng', action='store_true', default=None, help='Do not load rng state when loading checkpoint.') - group.add_argument('--exit-on-missing-checkpoint', action='store_true', + parser.add_argument('--exit-on-missing-checkpoint', action='store_true', help="If '--load' is set, but checkpoint is not found " "(e.g., path typo), then exit instead of random " "initialization.") diff --git a/training/DeepSpeed-Domino/megatron/checkpointing.py b/training/DeepSpeed-Domino/megatron/checkpointing.py index 19aef958f..0d26d7967 100644 --- a/training/DeepSpeed-Domino/megatron/checkpointing.py +++ b/training/DeepSpeed-Domino/megatron/checkpointing.py @@ -247,7 +247,10 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler): # Arguments, iteration, and model. state_dict = {} - # state_dict['args'] = args + t_args = args + t_args.init_method = None + t_args.output_layer_init_method = None + state_dict['args'] = t_args state_dict['checkpoint_version'] = 3.0 state_dict['iteration'] = iteration if len(model) == 1: