diff --git a/README.md b/README.md
index 45c3540..87cfc21 100644
--- a/README.md
+++ b/README.md
@@ -15,12 +15,30 @@ A torch library for easy distributed deep learning on HPC clusters. Supports bot
 - A wealth of useful utility functions
 
 ## Installation
+dmlcloud can be installed directly from PyPI:
 ```
 pip install dmlcloud
 ```
 
+Alternatively, you can install the latest development version directly from GitHub:
+```
+pip install git+https://github.com/sehoffmann/dmlcloud.git
+```
+
 ## Minimal Example
-*TODO*
+[examples/barebone_mnist.py](examples/barebone_mnist.py) features a minimal, barebones example of how to train MNIST in a distributed fashion.
+To run it on a single node with 4 GPUs, use
+```
+dmlrun -n 4 examples/barebone_mnist.py
+```
+
+`dmlrun` is a thin wrapper around `torchrun` that makes development work on a single node easier.
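+
+Under the hood, `dmlrun` launches your script with `torchrun`, which provides the usual `RANK`, `LOCAL_RANK`, and `WORLD_SIZE` environment variables to each process. For reference, the following is a minimal, hypothetical sketch of what such a training script can look like using plain `torch.distributed` (the actual example script is not reproduced in this diff and may rely on dmlcloud utilities instead):
+```
+import os
+import torch
+import torch.distributed as dist
+
+def train():
+    # torchrun (and hence dmlrun) sets RANK, WORLD_SIZE, MASTER_ADDR, ... in the environment
+    dist.init_process_group(backend='nccl')
+    device = torch.device('cuda', int(os.environ['LOCAL_RANK']))
+    torch.cuda.set_device(device)
+
+    model = torch.nn.Linear(28 * 28, 10).to(device)
+    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[device.index])
+
+    # ... build a DataLoader with a DistributedSampler and run the usual training loop ...
+
+    dist.destroy_process_group()
+
+if __name__ == '__main__':
+    train()
+```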
+
+To run your training across multiple nodes on a Slurm cluster instead, you can simply use `srun`:
+```
+srun --ntasks-per-node [NUM_GPUS] python examples/barebone_mnist.py
+```
 
 ## Documentation
diff --git a/dmlcloud/run.py b/dmlcloud/run.py
new file mode 100644
index 0000000..0f56aec
--- /dev/null
+++ b/dmlcloud/run.py
@@ -0,0 +1,70 @@
+"""
+usage: dmlrun [-h] [--gpus GPUS] [--nprocs NPROCS] script ...
+
+dmlrun is a thin wrapper around torchrun (torch.distributed.run) that provides a more user-friendly interface.
+
+While torchrun is a powerful tool, it can be a bit clunky to use for testing and debugging. dmlrun aims to make it
+easier to launch distributed training jobs on a single node. For serious multi-node training, we recommend using
+srun or torchrun directly.
+
+positional arguments:
+  script                Path to the script to run.
+  args                  Arguments to pass to the script.
+
+options:
+  -h, --help            show this help message and exit
+  --gpus GPUS, -g GPUS  Comma-separated list of GPU IDs to use for training. Overrides CUDA_VISIBLE_DEVICES.
+  --nprocs NPROCS, -n NPROCS
+                        Number of GPUs to use for training.
+
+Example:
+    dmlrun --gpus 3,7 train.py
+    dmlrun --nprocs 2 train.py --batch-size 64
+"""
+
+import argparse
+import os
+
+
+def main():
+    description = (
+        'dmlrun is a thin wrapper around torchrun (torch.distributed.run) that provides a more user-friendly interface.\n\n'
+        'While torchrun is a powerful tool, it can be a bit clunky to use for testing and debugging. '
+        'dmlrun aims to make it easier to launch distributed training jobs on a single node. '
+        'For serious multi-node training, we recommend using srun or torchrun directly.'
+    )
+    epilog = (
+        'Example:\n'
+        '    dmlrun --gpus 3,7 train.py\n'
+        '    dmlrun --nprocs 2 train.py --batch-size 64'
+    )
+    parser = argparse.ArgumentParser(
+        prog='dmlrun',
+        description=description,
+        epilog=epilog,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument('--gpus', '-g', help='Comma-separated list of GPU IDs to use for training. Overrides CUDA_VISIBLE_DEVICES.')
+    parser.add_argument('--nprocs', '-n', type=int, help='Number of GPUs to use for training.')
+    parser.add_argument('script', type=str, help='Path to the script to run.')
+    parser.add_argument('args', nargs=argparse.REMAINDER, help='Arguments to pass to the script.')
+
+    args = parser.parse_args()
+
+    if args.gpus and args.nprocs:
+        raise ValueError('Only one of --gpus or --nprocs can be specified.')
+
+    if args.gpus:
+        gpu_ids = args.gpus.split(',')
+        if not all(gpu_id.isdigit() for gpu_id in gpu_ids):
+            raise ValueError('GPU IDs must be integers.')
+        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus
+        nprocs = len(gpu_ids)
+    elif args.nprocs:
+        nprocs = args.nprocs
+    else:
+        nprocs = 1
+
+    # Deferred import: argument errors surface without paying the cost of importing torch.
+    import torch.distributed.run
+
+    cmdline = [
+        '--standalone',
+        '--nproc_per_node',
+        f'{nprocs}',
+    ]
+    cmdline += [args.script] + args.args
+
+    print('Executing: torchrun', ' '.join(cmdline), flush=True)
+    torch.distributed.run.main(cmdline)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index f937db8..06b369e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,6 +24,9 @@ dynamic = ["version", "readme", "dependencies"]
 
 [project.urls]
 Repository = "https://github.com/sehoffmann/dmlcloud"
 
+[project.scripts]
+dmlrun = "dmlcloud.run:main"
+
 [tool.setuptools.packages.find]
 include = ["dmlcloud*"]
 namespaces = false
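The `[project.scripts]` entry above is what installs the `dmlrun` command: pip generates a console script that imports `dmlcloud.run` and calls its `main()` function. After installation, `dmlrun -n 4 examples/barebone_mnist.py` therefore behaves the same as `python -m dmlcloud.run -n 4 examples/barebone_mnist.py` (covered by the `if __name__ == '__main__'` guard), and both ultimately delegate to `torchrun --standalone --nproc_per_node 4 examples/barebone_mnist.py`.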