
Commit

iter
MikhailKardash committed Oct 23, 2024
1 parent 1e654f6 commit cb71a74
Showing 2 changed files with 48 additions and 52 deletions.
97 changes: 48 additions & 49 deletions harness/determined/exec/harness.py
@@ -210,55 +210,54 @@ def _run_deepspeed_trial(
 
     logger.debug("Starting harness.")
 
-    with maybe_periodic_stacktraces(info.trial._debug):
-        with det_ds.init(
-            hparams=info.trial.hparams,
-            exp_conf=info.trial._config,
-        ) as train_context:
-            fp16_compression = bool(info.trial._config["optimizations"]["gradient_compression"])
-            average_aggregated_gradients = bool(
-                info.trial._config["optimizations"]["average_aggregated_gradients"]
-            )
-
-            train_context._set_default_gradient_compression(fp16_compression)
-            train_context._set_default_average_aggregated_gradients(average_aggregated_gradients)
-            train_context._set_is_pre_trainer()
-
-            trial_inst = trial_class(train_context)
-
-            if train_context.distributed.size > 1 and not train_context.distributed.rank == 0:
-                log_level = logging.DEBUG if info.trial._debug else logging.WARNING
-                logging.getLogger().setLevel(log_level)
-
-            logger.info(
-                f"Creating {det_ds.DeepSpeedTrialController.__name__} with {trial_class.__name__}."
-            )
-
-            trainer = det_ds.Trainer(trial_inst, train_context)
-
-            if "global_batch_size" in info.trial.hparams:
-                global_batch_size = int(
-                    info.trial.hparams["global_batch_size"]
-                )  # type: Optional[int]
-            else:
-                global_batch_size = None
-
-            trainer.fit(
-                checkpoint_period=pytorch.TrainUnit._from_values(
-                    **info.trial._config["min_checkpoint_period"],
-                    global_batch_size=global_batch_size,
-                ),
-                validation_period=pytorch.TrainUnit._from_values(
-                    **info.trial._config["min_validation_period"],
-                    global_batch_size=global_batch_size,
-                ),
-                reporting_period=pytorch.Batch(info.trial._config["scheduling_unit"]),
-                checkpoint_policy=info.trial._config["checkpoint_policy"],
-                latest_checkpoint=info.latest_checkpoint,
-                step_zero_validation=info.trial._config["perform_initial_validation"],
-                test_mode=False,
-                profiling_enabled=bool(info.trial._config["profiling"]["enabled"]),
-            )
+    with det_ds.init(
+        hparams=info.trial.hparams,
+        exp_conf=info.trial._config,
+    ) as train_context:
+        fp16_compression = bool(info.trial._config["optimizations"]["gradient_compression"])
+        average_aggregated_gradients = bool(
+            info.trial._config["optimizations"]["average_aggregated_gradients"]
+        )
+
+        train_context._set_default_gradient_compression(fp16_compression)
+        train_context._set_default_average_aggregated_gradients(average_aggregated_gradients)
+        train_context._set_is_pre_trainer()
+
+        trial_inst = trial_class(train_context)
+
+        if train_context.distributed.size > 1 and not train_context.distributed.rank == 0:
+            log_level = logging.DEBUG if info.trial._debug else logging.WARNING
+            logging.getLogger().setLevel(log_level)
+
+        logger.info(
+            f"Creating {det_ds.DeepSpeedTrialController.__name__} with {trial_class.__name__}."
+        )
+
+        trainer = det_ds.Trainer(trial_inst, train_context)
+
+        if "global_batch_size" in info.trial.hparams:
+            global_batch_size = int(
+                info.trial.hparams["global_batch_size"]
+            )  # type: Optional[int]
+        else:
+            global_batch_size = None
+
+        trainer.fit(
+            checkpoint_period=pytorch.TrainUnit._from_values(
+                **info.trial._config["min_checkpoint_period"],
+                global_batch_size=global_batch_size,
+            ),
+            validation_period=pytorch.TrainUnit._from_values(
+                **info.trial._config["min_validation_period"],
+                global_batch_size=global_batch_size,
+            ),
+            reporting_period=pytorch.Batch(info.trial._config["scheduling_unit"]),
+            checkpoint_policy=info.trial._config["checkpoint_policy"],
+            latest_checkpoint=info.latest_checkpoint,
+            step_zero_validation=info.trial._config["perform_initial_validation"],
+            test_mode=False,
+            profiling_enabled=bool(info.trial._config["profiling"]["enabled"]),
+        )
 
     return 0
 
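For orientation, here is a minimal sketch of a user-facing script driving the same entry points the harness exercises above (det_ds.init, det_ds.Trainer, trainer.fit, pytorch.Batch). It is not part of this commit: MyDeepSpeedTrial, the hyperparameter values, and the assumption that the omitted fit() arguments have defaults are all illustrative.

# Hypothetical usage sketch, not part of this commit. Only the calls visible in
# harness.py above are assumed to exist; everything else is a placeholder.
from determined import pytorch
from determined.pytorch import deepspeed as det_ds

from my_trial import MyDeepSpeedTrial  # hypothetical user-defined DeepSpeedTrial


def main() -> None:
    hparams = {"global_batch_size": 64}  # illustrative values only
    with det_ds.init(hparams=hparams) as train_context:
        trial = MyDeepSpeedTrial(train_context)
        trainer = det_ds.Trainer(trial, train_context)
        # Periods expressed in batches, mirroring pytorch.Batch(...) in the diff;
        # other fit() arguments are omitted on the assumption they have defaults.
        trainer.fit(
            checkpoint_period=pytorch.Batch(100),
            validation_period=pytorch.Batch(100),
        )


if __name__ == "__main__":
    main()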
3 changes: 0 additions & 3 deletions harness/determined/pytorch/deepspeed/_trainer.py
@@ -236,7 +236,6 @@ def init(
     if local_training:
         trial_seed = None
         steps_completed = 0
-        debug_enabled = False
         num_gpus = len(gpu.get_gpu_uuids())
     else:
         assert cluster_info, "Unable to detect cluster info"
@@ -245,7 +244,6 @@
         exp_conf = cluster_info.trial._config
         steps_completed = cluster_info.trial._steps_completed
         num_gpus = len(cluster_info.gpu_uuids)
-        debug_enabled = cluster_info.trial._debug
 
     _set_random_seeds(trial_seed)
 
@@ -262,7 +260,6 @@
             num_gpus=num_gpus,
             exp_conf=exp_conf,
             steps_completed=steps_completed,
-            debug_enabled=debug_enabled,
             enable_tensorboard_logging=enable_tensorboard_logging,
 
