Update to use torchrun in Multi-GPU SSL Training (#1663)
Fixes #1662

### Description
This PR updates the multi-GPU SSL training tutorial to launch with `torchrun` instead of the deprecated `torch.distributed.launch`: the README launch command is updated, the `--local_rank` argparse flag is removed in favor of the `LOCAL_RANK` environment variable that `torchrun` sets, and leftover debug print statements are deleted.

### Checks
<!--- Put an `x` in all the boxes that apply, and remove the not
applicable items -->
- [ ] Avoid including large-size files in the PR.
- [ ] Clean up long text outputs from code cells in the notebook.
- [ ] For security purposes, please check the contents and remove any
sensitive info such as user names and private key.
- [ ] Ensure (1) hyperlinks and markdown anchors are working (2) use
relative paths for tutorial repo files (3) put figure and graphs in the
`./figure` folder
- [ ] Notebook runs automatically `./runner.sh -t <path to .ipynb file>`

---------

Signed-off-by: YunLiu <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
KumoLiu and pre-commit-ci[bot] authored Mar 21, 2024
1 parent 34951f3 commit 5d6f600
Showing 2 changed files with 3 additions and 9 deletions.
2 changes: 1 addition & 1 deletion self_supervised_pretraining/vit_unetr_ssl/README.md
@@ -144,7 +144,7 @@ At the time of creation of this tutorial, the below additional dependencies are
To begin training with 2 GPUs, please see the below example command for executing the SSL multi-GPU training script:

- `CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 mgpu_ssl_train.py --batch_size=8 --epochs=500 --base_lr=2e-4 --logdir_path=/to/be/defined --output=/to/be/defined --data_root=/to/be/defined --json_path=/to/be/defined`
+ `CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 mgpu_ssl_train.py --batch_size=8 --epochs=500 --base_lr=2e-4 --logdir_path=/to/be/defined --output=/to/be/defined --data_root=/to/be/defined --json_path=/to/be/defined`

It can be configured to launch on more GPUs by adding the relevant `CUDA Device` IDs to `CUDA_VISIBLE_DEVICES` and increasing the total GPU count in `--nproc_per_node`.
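
For example, scaling the same pattern to four GPUs would look like the following (device IDs and paths are placeholders, as above):

`CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4 mgpu_ssl_train.py --batch_size=8 --epochs=500 --base_lr=2e-4 --logdir_path=/to/be/defined --output=/to/be/defined --data_root=/to/be/defined --json_path=/to/be/defined`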
10 changes: 2 additions & 8 deletions self_supervised_pretraining/vit_unetr_ssl/mgpu_ssl_train.py
@@ -67,8 +67,6 @@ def parse_option():
metavar="PATH",
help="root of output folder, the full path is <output>/<model_name>/<tag> (default: output)",
)
- # Distributed Training
- parser.add_argument("--local_rank", type=int, help="local rank for DistributedDataParallel")

# DL Training Hyper-parameters
parser.add_argument("--epochs", default=100, type=int, help="number of epochs")
@@ -139,10 +137,6 @@ def main(args):
data_list_file_path=json_path, is_segmentation=False, data_list_key="validation", base_dir=data_root
)

- # TODO Delete the below print statements
- print("List of training samples: {}".format(train_list))
- print("List of validation samples: {}".format(val_list))

print("Total training data are {} and validation data are {}".format(len(train_list), len(val_list)))

train_dataset = CacheDataset(data=train_list, transform=train_transforms, cache_rate=1.0, num_workers=4)
@@ -191,7 +185,7 @@ def main(args):
optimizer = torch.optim.Adam(model.parameters(), lr=args.base_lr)

model = torch.nn.parallel.DistributedDataParallel(
- model, device_ids=[args.local_rank], broadcast_buffers=False, find_unused_parameters=True
+ model, device_ids=[int(os.environ["LOCAL_RANK"])], broadcast_buffers=False, find_unused_parameters=True
)
model_without_ddp = model.module

@@ -340,7 +334,7 @@ def validate(data_loader, model, loss_functions):
else:
rank = -1
world_size = -1
- torch.cuda.set_device(args.local_rank)
+ torch.cuda.set_device(rank)
torch.distributed.init_process_group(backend="nccl", init_method="env://", world_size=world_size, rank=rank)
torch.distributed.barrier()
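
Put together, the initialization pattern the updated script relies on looks roughly like the sketch below. This is illustrative rather than the file's exact code, and the helper name `init_distributed` is made up for this example:

```python
import os
import torch

def init_distributed():
    # torchrun populates these environment variables for every worker.
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    local_rank = int(os.environ["LOCAL_RANK"])

    # Bind this process to its GPU, then join the process group.
    # On a single node, rank and local_rank coincide.
    torch.cuda.set_device(local_rank)
    torch.distributed.init_process_group(
        backend="nccl", init_method="env://", world_size=world_size, rank=rank
    )
    torch.distributed.barrier()
    return rank, world_size, local_rank
```

The model can then be wrapped with `torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], ...)`, which is what the diff above does via `os.environ["LOCAL_RANK"]`.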

