Commit 3afa89d: minors

LiSu committed Feb 28, 2024 (parent: 7724b34)
Showing 3 changed files with 4 additions and 5 deletions.
graph_neural_network/README.md: 2 additions, 3 deletions
````diff
@@ -102,7 +102,7 @@ The number of partitions and number of training nodes must be the same. In each
 CUDA_VISIBLE_DEVICES=0,1 python dist_train_rgnn.py --num_nodes=2 --node_rank=0 --num_training_procs=2 --master_addr=master_address_ip --model='rgat' --dataset_size='full' --layout='CSC'
 
 # node 1:
-CUDA_VISIBLE_DEVICES=0,1 python dist_train_rgnn.py --num_nodes=2 --node_rank=1 --num_training_procs=2 --master_addr=master_address_ip --model='rgat' --dataset_size='full' --layout='CSC'
+CUDA_VISIBLE_DEVICES=2,3 python dist_train_rgnn.py --num_nodes=2 --node_rank=1 --num_training_procs=2 --master_addr=master_address_ip --model='rgat' --dataset_size='full' --layout='CSC'
 ```
 The above script assumes that the training nodes are equipped with 2 GPUs and the number of training processes is equal to the number of GPUs. Each training process has a corresponding
 sampling process using the same GPU.
@@ -117,8 +117,7 @@ The `master_address_ip` should be replaced with the actual IP address of the mas
 CUDA_VISIBLE_DEVICES=0,1 python dist_train_rgnn.py --num_nodes=2 --node_rank=0 --num_training_procs=1 --master_addr=localhost --model='rgat' --dataset_size='full' --layout='CSC' --split_training_sampling
 
 # node 1:
-CUDA_VISIBLE_DEVICES=2,3 python dist_train_rgnn.py --num_nodes=2 --node_rank=1 --num_training_procs=1 --master_addr=localhost --model='rgat' --dataset_size='full'
---layout='CSC' --split_training_sampling
+CUDA_VISIBLE_DEVICES=2,3 python dist_train_rgnn.py --num_nodes=2 --node_rank=1 --num_training_procs=1 --master_addr=localhost --model='rgat' --dataset_size='full' --layout='CSC' --split_training_sampling
 ```
 The above script uses one GPU for training and another for sampling in each node.
````
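A note on why the `CUDA_VISIBLE_DEVICES` values in these launch commands work: PyTorch re-indexes whatever GPUs the variable exposes, starting from zero, so each training process can select its device by local rank regardless of which physical GPUs the node was assigned. A minimal sketch of that mapping, assuming a `torch.multiprocessing` launcher; the function and argument names here are illustrative, not copied from `dist_train_rgnn.py`:

```python
import torch
import torch.multiprocessing as mp

def run_proc(local_proc_rank: int, num_training_procs: int):
    # With CUDA_VISIBLE_DEVICES=2,3 exported, the two exposed GPUs are
    # re-indexed by PyTorch as cuda:0 and cuda:1, so the local process
    # rank doubles as the device index.
    device = torch.device(f"cuda:{local_proc_rank}")
    torch.cuda.set_device(device)
    print(f"process {local_proc_rank}/{num_training_procs} -> {device}")

if __name__ == "__main__":
    # One training process per visible GPU, matching the README's setup.
    mp.spawn(run_proc, args=(2,), nprocs=2)
```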
graph_neural_network/dist_train_rgnn.py: 1 addition, 1 deletion
````diff
@@ -63,7 +63,7 @@ def evaluate(model, dataloader, current_device, use_fp16, with_gpu,
     key=mllog_constants.EVAL_STOP,
     metadata={mllog_constants.EPOCH_NUM: epoch_num},
   )
-  return acc.item(), global_acc
+  return acc, global_acc
 
 def run_training_proc(local_proc_rank, num_nodes, node_rank, num_training_procs,
                       split_training_sampling, hidden_channels, num_classes, num_layers,
````
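The change in `evaluate` returns `acc` as-is instead of unwrapping it with `.item()`. For readers unfamiliar with the distinction, a standalone illustration (not the repository's code): `Tensor.item()` converts a one-element tensor into a plain Python number, which on a CUDA tensor forces a host-device synchronization, whereas returning the tensor defers that cost to the caller.

```python
import torch

acc = torch.tensor(0.731)  # stand-in for the accuracy computed in evaluate()

as_tensor = acc        # 0-dim tensor: stays on its device; no sync forced
as_float = acc.item()  # plain float: on a CUDA tensor this copies the value
                       # back to the host, synchronizing with the GPU
print(type(as_tensor), type(as_float))  # <class 'torch.Tensor'> <class 'float'>
```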
graph_neural_network/train_rgnn_multi_gpu.py: 1 addition, 1 deletion
````diff
@@ -61,7 +61,7 @@ def evaluate(model, dataloader, current_device, rank, world_size, epoch_num):
     key=mllog_constants.EVAL_STOP,
     metadata={mllog_constants.EPOCH_NUM: epoch_num},
   )
-  return acc.item(), global_acc
+  return acc, global_acc
 
 def run_training_proc(rank, world_size,
                       hidden_channels, num_classes, num_layers, model_type, num_heads, fan_out,
````
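The `rank`/`world_size` parameters of this `evaluate` suggest that `global_acc` is an accuracy aggregated across ranks. Purely as a hypothetical sketch of such an aggregation; the helper name and reduction strategy are assumptions, not this file's actual logic:

```python
import torch
import torch.distributed as dist

def aggregate_accuracy(correct: torch.Tensor, total: torch.Tensor) -> torch.Tensor:
    # Assumes dist.init_process_group(...) was already called by the launcher.
    # Sum per-rank counts across the whole job, then divide once, giving an
    # exact global accuracy rather than an average of per-rank averages.
    dist.all_reduce(correct, op=dist.ReduceOp.SUM)
    dist.all_reduce(total, op=dist.ReduceOp.SUM)
    return correct / total
```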
