Round up epoch_num, add GRADIENT_ACCUMULATION_STEPS and OPT_NAME into log outputs

Committed-by: LiSu from Dev container
LiSu committed Mar 14, 2024
1 parent 3afa89d commit 8ed4fc8
Showing 2 changed files with 9 additions and 4 deletions.
7 changes: 5 additions & 2 deletions graph_neural_network/dist_train_rgnn.py
@@ -257,7 +257,7 @@ def run_training_proc(local_proc_rank, num_nodes, node_rank, num_training_procs,
         torch.cuda.synchronize()
       torch.distributed.barrier()
       if rank == 0:
-        epoch_num = epoch + idx / batch_num
+        epoch_num = round((epoch + idx / batch_num), 2)
         glt.utils.common.save_ckpt(idx + epoch * batch_num,
                                    ckpt_dir, model.module, optimizer, epoch_num)
       torch.distributed.barrier()
@@ -266,7 +266,7 @@ def run_training_proc(local_proc_rank, num_nodes, node_rank, num_training_procs,
       if with_gpu:
         torch.cuda.synchronize()
       torch.distributed.barrier()
-      epoch_num = epoch + idx / batch_num
+      epoch_num = round((epoch + idx / batch_num), 2)
       model.eval()
       rank_val_acc, global_acc = evaluate(model, val_loader, current_device,
                                           use_fp16, with_gpu, rank,
@@ -415,7 +415,10 @@ def run_training_proc(local_proc_rank, num_nodes, node_rank, num_training_procs,
   if args.node_rank == 0:
     world_size = args.num_nodes * args.num_training_procs
     submission_info(mllogger, 'GNN', 'reference_implementation')
+
     mllogger.event(key=mllog_constants.GLOBAL_BATCH_SIZE, value=world_size*args.train_batch_size)
+    mllogger.event(key=mllog_constants.GRADIENT_ACCUMULATION_STEPS, value=1)
+    mllogger.event(key=mllog_constants.OPT_NAME, value='Adam')
     mllogger.event(key=mllog_constants.OPT_BASE_LR, value=args.learning_rate)
     mllogger.event(key=mllog_constants.SEED,value=args.random_seed)
     mllogger.end(key=mllog_constants.INIT_STOP)
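Note on the rounding change: epoch_num is a fractional epoch counter (completed epochs plus the fraction of batches finished in the current epoch), and rounding it to two decimals keeps checkpoint metadata and log lines tidy. Despite the commit title, Python's round() rounds to the nearest value rather than strictly up. A standalone sketch with hypothetical values:

    # Fractional-epoch counter as computed in the diff above;
    # the epoch/idx/batch_num values here are hypothetical.
    epoch = 2          # completed full epochs
    idx = 347          # current batch index within the epoch
    batch_num = 1024   # batches per epoch

    raw = epoch + idx / batch_num   # 2.3388671875
    epoch_num = round(raw, 2)       # 2.34, the value logged and checkpointed

    print(raw, epoch_num)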
6 changes: 4 additions & 2 deletions graph_neural_network/train_rgnn_multi_gpu.py
@@ -186,7 +186,7 @@ def run_training_proc(rank, world_size,
         torch.cuda.synchronize()
       dist.barrier()
       if rank == 0:
-        epoch_num = epoch + idx / batch_num
+        epoch_num = round((epoch + idx / batch_num), 2)
         glt.utils.common.save_ckpt(idx + epoch * batch_num,
                                    ckpt_dir, model.module, optimizer, epoch_num)
       dist.barrier()
@@ -195,7 +195,7 @@ def run_training_proc(rank, world_size,
       if with_gpu:
         torch.cuda.synchronize()
       dist.barrier()
-      epoch_num = epoch + idx / batch_num
+      epoch_num = round((epoch + idx / batch_num), 2)
       model.eval()
       rank_val_acc, global_acc = evaluate(model, val_loader, current_device,
                                           rank, world_size, epoch_num)
@@ -313,6 +313,8 @@ def run_training_proc(rank, world_size,
   world_size = torch.cuda.device_count()
   submission_info(mllogger, 'GNN', 'reference_implementation')
   mllogger.event(key=mllog_constants.GLOBAL_BATCH_SIZE, value=world_size*args.train_batch_size)
+  mllogger.event(key=mllog_constants.GRADIENT_ACCUMULATION_STEPS, value=1)
+  mllogger.event(key=mllog_constants.OPT_NAME, value='Adam')
   mllogger.event(key=mllog_constants.OPT_BASE_LR, value=args.learning_rate)
   mllogger.event(key=mllog_constants.SEED,value=args.random_seed)
   mllogger.end(key=mllog_constants.INIT_STOP)
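Note on the logging change: both files emit the two new MLPerf hyperparameter events during initialization, before INIT_STOP. A minimal sketch of how these events are typically emitted, assuming the mlperf_logging package is installed; the logger construction via get_mllogger() is that package's standard helper, and the event values mirror the added lines above:

    # Sketch: emit the two hyperparameter events added by this commit.
    from mlperf_logging import mllog
    from mlperf_logging.mllog import constants as mllog_constants

    mllogger = mllog.get_mllogger()

    # No gradient accumulation in the reference run, hence value=1.
    mllogger.event(key=mllog_constants.GRADIENT_ACCUMULATION_STEPS, value=1)
    # The reference implementation optimizes with Adam.
    mllogger.event(key=mllog_constants.OPT_NAME, value='Adam')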
