
Commit 6e6f6d0

fix: (hopefully) doc

sehoffmann committed Dec 16, 2024
Parent: 209249a
Showing 4 changed files with 19 additions and 12 deletions.
.readthedocs.yaml (2 changes: 1 addition & 1 deletion)
```diff
@@ -29,4 +29,4 @@ sphinx:
 # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
 python:
   install:
-    - requirements: ci_requirements.txt
+    - requirements: doc/requirements.txt
```
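Reassembled from the hunk above, the block that Read the Docs consumes after this commit reads:

```yaml
# .readthedocs.yaml, relevant excerpt after this commit
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
  install:
    - requirements: doc/requirements.txt
```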
README.md (2 changes: 1 addition & 1 deletion)
````diff
@@ -26,7 +26,7 @@ pip install git+https://github.com/tangentlabs/django-oscar-paypal.git@issue/34/
 ```
 
 ## Minimal Example
-[examples/barebone_mnist.py](example/barebone_mnist.py) features a minimal and barebone example on how to distributely train MNIST.
+See [examples/barebone_mnist.py](https://github.com/sehoffmann/dmlcloud/blob/develop/examples/barebone_mnist.py) for a minimal and barebone example on how to distributely train MNIST.
 To run it on a single node with 4 GPUs, use
 ```
 dmlrun -n 4 examples/barebone_mnist.py
````
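Per the epilog added to dmlcloud/run.py below, the example can also be pinned to specific devices rather than a device count; a hypothetical invocation combining the README command with the `--gpus` flag would be:

```
dmlrun --gpus 0,1,2,3 examples/barebone_mnist.py
```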
dmlcloud/run.py (24 changes: 14 additions & 10 deletions)
```diff
@@ -23,15 +23,20 @@
 import argparse
 import os
 
+
 def main():
-    description = ('dmlrun is a thin wrapper around torch.distributed.launch that provides a more user-friendly interface.\n\n'
-                   'While torchrun is a powerful tool, it can be a bit clunky to use for testing and debugging. dmlrun aims to make it easier to launch distributed training jobs on a single node.'
-                   'For serious mulit-node training, we recommend using srun or torchrun directly.')
-    epilog = ('Example:\n'
-              ' dmlrun --gpus 3,7 train.py\n'
-              ' dmlrun --num-gpus 2 train.py --batch-size 64')
-    parser = argparse.ArgumentParser(prog='dmlrun', description=description, epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter)
-    parser.add_argument('--gpus', '-g', help='Comma-seperated list of GPU IDs to use for training. Overrides CUDA_VISIBLE_DEVICES.')
+    description = (
+        'dmlrun is a thin wrapper around torch.distributed.launch that provides a more user-friendly interface.\n\n'
+        'While torchrun is a powerful tool, it can be a bit clunky to use for testing and debugging. dmlrun aims to make it easier to launch distributed training jobs on a single node.'
+        'For serious mulit-node training, we recommend using srun or torchrun directly.'
+    )
+    epilog = 'Example:\n' ' dmlrun --gpus 3,7 train.py\n' ' dmlrun --num-gpus 2 train.py --batch-size 64'
+    parser = argparse.ArgumentParser(
+        prog='dmlrun', description=description, epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument(
+        '--gpus', '-g', help='Comma-seperated list of GPU IDs to use for training. Overrides CUDA_VISIBLE_DEVICES.'
+    )
     parser.add_argument('--nprocs', '-n', type=int, help='Number of GPUs to use for training.')
     parser.add_argument('script', type=str, help='Path to the script to run.')
     parser.add_argument('args', nargs=argparse.REMAINDER, help='Arguments to pass to the script.')
@@ -41,7 +46,6 @@ def main():
     if args.gpus and args.num_gpus:
         raise ValueError('Only one of --gpus or --num-gpus can be specified.')
 
-
     if args.gpus:
         ids = args.gpus.split(',')
         if not all(id.isdigit() for id in ids):
@@ -67,4 +71,4 @@
 
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
```
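For readers skimming the hunks, the following self-contained sketch stitches together the argument handling they show. Only the parser setup and the split()/isdigit() validation appear verbatim in the diff; set_visible_devices() and the CUDA_VISIBLE_DEVICES assignment are assumptions inferred from the `--gpus` help text. Note also that the unchanged context line checks `args.num_gpus` even though the option is registered as `--nprocs` (argparse stores it as `args.nprocs`); the sketch uses the attribute that actually exists.

```python
# Self-contained sketch of dmlrun's argument handling as shown in the diff.
# set_visible_devices() and the CUDA_VISIBLE_DEVICES assignment are assumptions
# inferred from the --gpus help text, not code taken from the commit.
import argparse
import os


def set_visible_devices(gpus: str) -> None:
    ids = gpus.split(',')
    if not all(part.isdigit() for part in ids):
        raise ValueError('--gpus expects a comma-separated list of integer GPU IDs, e.g. "3,7".')
    os.environ['CUDA_VISIBLE_DEVICES'] = gpus  # restrict CUDA to the selected devices


def main():
    parser = argparse.ArgumentParser(prog='dmlrun')
    parser.add_argument('--gpus', '-g', help='Comma-separated list of GPU IDs. Overrides CUDA_VISIBLE_DEVICES.')
    parser.add_argument('--nprocs', '-n', type=int, help='Number of GPUs to use for training.')
    parser.add_argument('script', type=str, help='Path to the script to run.')
    parser.add_argument('args', nargs=argparse.REMAINDER, help='Arguments to pass to the script.')
    args = parser.parse_args()

    # The diff's context line tests args.num_gpus, but --nprocs is stored as
    # args.nprocs by argparse; using the latter avoids an AttributeError.
    if args.gpus and args.nprocs:
        raise ValueError('Only one of --gpus or --nprocs can be specified.')
    if args.gpus:
        set_visible_devices(args.gpus)


if __name__ == '__main__':
    main()
```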
doc/requirements.txt (3 changes: 3 additions & 0 deletions)
```diff
@@ -0,0 +1,3 @@
+-r ../ci_requirements.txt
+-e ./
+-r ../requirements.txt
```
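The new file chains the existing requirement sets: the CI requirements, an editable install of the project itself, and the runtime requirements. Assuming the relative paths resolve as intended on Read the Docs, the same docs environment can be reproduced locally with:

```
pip install -r doc/requirements.txt
```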
