diff --git a/harness/determined/exec/harness.py b/harness/determined/exec/harness.py
index ce15fadc788..92fafc50c70 100644
--- a/harness/determined/exec/harness.py
+++ b/harness/determined/exec/harness.py
@@ -210,55 +210,54 @@ def _run_deepspeed_trial(

     logger.debug("Starting harness.")

-    with maybe_periodic_stacktraces(info.trial._debug):
-        with det_ds.init(
-            hparams=info.trial.hparams,
-            exp_conf=info.trial._config,
-        ) as train_context:
-            fp16_compression = bool(info.trial._config["optimizations"]["gradient_compression"])
-            average_aggregated_gradients = bool(
-                info.trial._config["optimizations"]["average_aggregated_gradients"]
-            )
-
-            train_context._set_default_gradient_compression(fp16_compression)
-            train_context._set_default_average_aggregated_gradients(average_aggregated_gradients)
-            train_context._set_is_pre_trainer()
-
-            trial_inst = trial_class(train_context)
-
-            if train_context.distributed.size > 1 and not train_context.distributed.rank == 0:
-                log_level = logging.DEBUG if info.trial._debug else logging.WARNING
-                logging.getLogger().setLevel(log_level)
-
-            logger.info(
-                f"Creating {det_ds.DeepSpeedTrialController.__name__} with {trial_class.__name__}."
-            )
-
-            trainer = det_ds.Trainer(trial_inst, train_context)
-
-            if "global_batch_size" in info.trial.hparams:
-                global_batch_size = int(
-                    info.trial.hparams["global_batch_size"]
-                )  # type: Optional[int]
-            else:
-                global_batch_size = None
-
-            trainer.fit(
-                checkpoint_period=pytorch.TrainUnit._from_values(
-                    **info.trial._config["min_checkpoint_period"],
-                    global_batch_size=global_batch_size,
-                ),
-                validation_period=pytorch.TrainUnit._from_values(
-                    **info.trial._config["min_validation_period"],
-                    global_batch_size=global_batch_size,
-                ),
-                reporting_period=pytorch.Batch(info.trial._config["scheduling_unit"]),
-                checkpoint_policy=info.trial._config["checkpoint_policy"],
-                latest_checkpoint=info.latest_checkpoint,
-                step_zero_validation=info.trial._config["perform_initial_validation"],
-                test_mode=False,
-                profiling_enabled=bool(info.trial._config["profiling"]["enabled"]),
-            )
+    with det_ds.init(
+        hparams=info.trial.hparams,
+        exp_conf=info.trial._config,
+    ) as train_context:
+        fp16_compression = bool(info.trial._config["optimizations"]["gradient_compression"])
+        average_aggregated_gradients = bool(
+            info.trial._config["optimizations"]["average_aggregated_gradients"]
+        )
+
+        train_context._set_default_gradient_compression(fp16_compression)
+        train_context._set_default_average_aggregated_gradients(average_aggregated_gradients)
+        train_context._set_is_pre_trainer()
+
+        trial_inst = trial_class(train_context)
+
+        if train_context.distributed.size > 1 and not train_context.distributed.rank == 0:
+            log_level = logging.DEBUG if info.trial._debug else logging.WARNING
+            logging.getLogger().setLevel(log_level)
+
+        logger.info(
+            f"Creating {det_ds.DeepSpeedTrialController.__name__} with {trial_class.__name__}."
+        )
+
+        trainer = det_ds.Trainer(trial_inst, train_context)
+
+        if "global_batch_size" in info.trial.hparams:
+            global_batch_size = int(
+                info.trial.hparams["global_batch_size"]
+            )  # type: Optional[int]
+        else:
+            global_batch_size = None
+
+        trainer.fit(
+            checkpoint_period=pytorch.TrainUnit._from_values(
+                **info.trial._config["min_checkpoint_period"],
+                global_batch_size=global_batch_size,
+            ),
+            validation_period=pytorch.TrainUnit._from_values(
+                **info.trial._config["min_validation_period"],
+                global_batch_size=global_batch_size,
+            ),
+            reporting_period=pytorch.Batch(info.trial._config["scheduling_unit"]),
+            checkpoint_policy=info.trial._config["checkpoint_policy"],
+            latest_checkpoint=info.latest_checkpoint,
+            step_zero_validation=info.trial._config["perform_initial_validation"],
+            test_mode=False,
+            profiling_enabled=bool(info.trial._config["profiling"]["enabled"]),
+        )

     return 0

diff --git a/harness/determined/pytorch/deepspeed/_trainer.py b/harness/determined/pytorch/deepspeed/_trainer.py
index 55df67fe452..4f9ecd37581 100644
--- a/harness/determined/pytorch/deepspeed/_trainer.py
+++ b/harness/determined/pytorch/deepspeed/_trainer.py
@@ -236,7 +236,6 @@ def init(
     if local_training:
         trial_seed = None
         steps_completed = 0
-        debug_enabled = False
         num_gpus = len(gpu.get_gpu_uuids())
     else:
         assert cluster_info, "Unable to detect cluster info"
@@ -245,7 +244,6 @@ def init(
         exp_conf = cluster_info.trial._config
         steps_completed = cluster_info.trial._steps_completed
         num_gpus = len(cluster_info.gpu_uuids)
-        debug_enabled = cluster_info.trial._debug

     _set_random_seeds(trial_seed)

@@ -262,7 +260,6 @@ def init(
         num_gpus=num_gpus,
         exp_conf=exp_conf,
         steps_completed=steps_completed,
-        debug_enabled=debug_enabled,
         enable_tensorboard_logging=enable_tensorboard_logging,
     )
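For reference, a minimal sketch of driving the same Trainer entrypoint directly (e.g. for local training) instead of through the harness above. It only uses calls that appear in this diff (det_ds.init, det_ds.Trainer, trainer.fit, pytorch.Batch); the module aliases, the model_def import, and the hyperparameter values are assumptions for illustration, not part of this change:

import determined.pytorch as pytorch           # assumed alias for the pytorch helpers
import determined.pytorch.deepspeed as det_ds  # assumed alias matching the harness code

from model_def import MyDeepSpeedTrial  # hypothetical det_ds.DeepSpeedTrial subclass

if __name__ == "__main__":
    # Hypothetical hyperparameters; the harness above only reads global_batch_size when present.
    hparams = {"global_batch_size": 64}

    # Presumably takes the local-training branch shown in _trainer.py above when no
    # cluster is detected (trial_seed=None, steps_completed=0, GPUs auto-detected).
    with det_ds.init(hparams=hparams) as train_context:
        trial = MyDeepSpeedTrial(train_context)
        trainer = det_ds.Trainer(trial, train_context)
        trainer.fit(
            checkpoint_period=pytorch.Batch(100),   # checkpoint every 100 batches
            validation_period=pytorch.Batch(100),   # validate every 100 batches
            test_mode=False,
        )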