feat: dmlrun
sehoffmann committed Dec 16, 2024
1 parent 33fc0e8 commit 209249a
Showing 3 changed files with 92 additions and 1 deletion.
20 changes: 19 additions & 1 deletion README.md
@@ -15,12 +15,30 @@ A torch library for easy distributed deep learning on HPC clusters. Supports bot
- A wealth of useful utility functions

## Installation
dmlcloud can be installed directly from PyPI:
```
pip install dmlcloud
```

Alternatively, you can install the latest development version directly from GitHub:
```
pip install git+https://github.com/sehoffmann/dmlcloud.git
```

## Minimal Example
*TODO*
[examples/barebone_mnist.py](examples/barebone_mnist.py) features a minimal, barebones example of how to train MNIST in a distributed fashion.
To run it on a single node with 4 GPUs, use
```
dmlrun -n 4 examples/barebone_mnist.py
```

`dmlrun` is a thin wrapper around `torchrun` that makes development work on a single node easier.
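
Under the hood, `dmlrun` forwards to `torchrun` in standalone mode (see `dmlcloud/run.py` below), so the invocation above is roughly equivalent to:
```
torchrun --standalone --nproc_per_node 4 examples/barebone_mnist.py
```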


To run your training across multiple nodes on a Slurm cluster instead, you can simply use `srun`:
```
srun --ntasks-per-node [NUM_GPUS] python examples/barebone_mnist.py
```
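
For batch jobs, the same `srun` line can be wrapped in a submission script. A minimal sketch, assuming 2 nodes with 4 GPUs each (partition names, module loads, and resource flags will vary per cluster):
```
#!/bin/bash
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=4
#SBATCH --gres=gpu:4

srun python examples/barebone_mnist.py
```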

## Documentation

70 changes: 70 additions & 0 deletions dmlcloud/run.py
@@ -0,0 +1,70 @@
"""
usage: dmlrun [-h] [--gpus GPUS] [--nprocs NPROCS] script ...
dmlrun is a thin wrapper around torch.distributed.launch that provides a more user-friendly interface.
While torchrun is a powerful tool, it can be a bit clunky to use for testing and debugging. dmlrun aims to make it easier to launch distributed training jobs on a single node.For serious mulit-node training, we recommend using srun or torchrun directly.
positional arguments:
script Path to the script to run.
args Arguments to pass to the script.
options:
-h, --help show this help message and exit
--gpus GPUS, -g GPUS Comma-seperated list of GPU IDs to use for training. Overrides CUDA_VISIBLE_DEVICES.
--nprocs NPROCS, -n NPROCS
Number of GPUs to use for training.
Example:
dmlrun --gpus 3,7 train.py
dmlrun --num-gpus 2 train.py --batch-size 64
"""

import argparse
import os

def main():
    description = ('dmlrun is a thin wrapper around torchrun that provides a more user-friendly interface.\n\n'
                   'While torchrun is a powerful tool, it can be a bit clunky to use for testing and debugging. '
                   'dmlrun aims to make it easier to launch distributed training jobs on a single node. '
                   'For serious multi-node training, we recommend using srun or torchrun directly.')
    epilog = ('Example:\n'
              '  dmlrun --gpus 3,7 train.py\n'
              '  dmlrun --nprocs 2 train.py --batch-size 64')
    parser = argparse.ArgumentParser(prog='dmlrun', description=description, epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--gpus', '-g', help='Comma-separated list of GPU IDs to use for training. Overrides CUDA_VISIBLE_DEVICES.')
    parser.add_argument('--nprocs', '-n', type=int, help='Number of GPUs to use for training.')
    parser.add_argument('script', type=str, help='Path to the script to run.')
    parser.add_argument('args', nargs=argparse.REMAINDER, help='Arguments to pass to the script.')

    args = parser.parse_args()

    if args.gpus and args.nprocs:
        raise ValueError('Only one of --gpus or --nprocs can be specified.')

    if args.gpus:
        # Pin the visible devices to the requested IDs and launch one process per device.
        ids = args.gpus.split(',')
        if not all(gpu_id.isdigit() for gpu_id in ids):
            raise ValueError('GPU IDs must be integers.')
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus
        nprocs = len(ids)
    elif args.nprocs:
        nprocs = args.nprocs
    else:
        nprocs = 1

    # Deferred import: keeps `dmlrun --help` responsive and avoids importing torch on argument errors.
    import torch.distributed.run

    # Single-node launch: --standalone lets torchrun handle rendezvous without a master address.
    cmdline = [
        '--standalone',
        '--nproc_per_node',
        f'{nprocs}',
    ]
    cmdline += [args.script] + args.args

    print('Executing: torchrun', ' '.join(cmdline), flush=True)
    torch.distributed.run.main(cmdline)


if __name__ == '__main__':
    main()
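
For context, a script launched via `dmlrun` (or `torchrun`) only needs to consume the environment variables the launcher sets, such as `RANK`, `LOCAL_RANK`, and `WORLD_SIZE`. A minimal, hypothetical sketch of such a script (not part of this commit):
```
import os

import torch
import torch.distributed as dist


def main():
    # torchrun exports RANK, WORLD_SIZE, MASTER_ADDR, etc., so the default
    # env:// rendezvous works out of the box. Use backend='nccl' on GPU nodes.
    dist.init_process_group(backend='gloo')
    rank = dist.get_rank()
    local_rank = int(os.environ['LOCAL_RANK'])

    tensor = torch.ones(1) * rank
    dist.all_reduce(tensor)  # defaults to a SUM across all processes
    print(f'rank {rank} (local rank {local_rank}): sum of ranks = {tensor.item()}')

    dist.destroy_process_group()


if __name__ == '__main__':
    main()
```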
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -24,6 +24,9 @@ dynamic = ["version", "readme", "dependencies"]
[project.urls]
Repository = "https://github.com/sehoffmann/dmlcloud"

[project.scripts]
dmlrun = "dmlcloud.run:main"

[tool.setuptools.packages.find]
include = ["dmlcloud*"]
namespaces = false
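The new `[project.scripts]` entry is what exposes `dmlrun` as a console command; after installing the package (e.g. an editable install during development), the wrapper is available on `PATH`:
```
pip install -e .
dmlrun --help
```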
