From a754ba9c4244cd7b0f4525e76032d45f7718cb16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mats=20Sj=C3=B6berg?= Date: Mon, 18 Sep 2023 12:48:24 +0300 Subject: [PATCH] Implemented --steps in lightning benchmark, Puhti results --- benchmarks/pytorch_visionmodel_lightning.py | 3 ++- pytorch-ddp-lightning.sh | 2 +- results.md | 16 ++++++++++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/benchmarks/pytorch_visionmodel_lightning.py b/benchmarks/pytorch_visionmodel_lightning.py index d297f4a..9d95677 100644 --- a/benchmarks/pytorch_visionmodel_lightning.py +++ b/benchmarks/pytorch_visionmodel_lightning.py @@ -74,6 +74,7 @@ def train(args): accelerator='gpu', strategy='ddp', precision=precision, + max_steps=args.steps, callbacks=[BenchmarkingCallback(args.warmup_steps, args.batchsize, world_size)]) @@ -115,7 +116,7 @@ def main(): help='Batch size') parser.add_argument('-j', '--workers', type=int, default=10, help='Number of data loader workers') - parser.add_argument('--steps', type=int, required=False, + parser.add_argument('--steps', type=int, required=False, default=-1, help='Maxium number of training steps') parser.add_argument('--warmup-steps', type=int, default=10, help='Number of initial steps to ignore in average') diff --git a/pytorch-ddp-lightning.sh b/pytorch-ddp-lightning.sh index 2770ae5..f75ec4f 100644 --- a/pytorch-ddp-lightning.sh +++ b/pytorch-ddp-lightning.sh @@ -4,7 +4,7 @@ export NCCL_DEBUG=INFO SCRIPT="benchmarks/pytorch_visionmodel_lightning.py" IMAGENET_DATA=/scratch/dac/data/ilsvrc2012-torch-resized-new.tar -SCRIPT_OPTS="--strategy=ddp --warmup-steps 100 --workers=$SLURM_CPUS_PER_TASK" +SCRIPT_OPTS="--strategy=ddp --warmup-steps 10 --workers=$SLURM_CPUS_PER_TASK" if [ $(( $NUM_GPUS * $SLURM_NNODES )) -ne $SLURM_NTASKS ]; then echo "ERROR: this script needs to be run as one task per GPU. Try using slurm/*-mpi.sh scripts." 
diff --git a/results.md b/results.md index 48996de..e1cb2d4 100644 --- a/results.md +++ b/results.md @@ -71,3 +71,19 @@ | DeepSpeed, synthetic data | PyTorch 2.0.0+cu117 | mahti | 8 | 2023-09-15 | 5813.62 | | Horovod, synthetic | PyTorch 2.0.0+cu117 | mahti | 8 | 2023-09-15 | 5235.30 | | Horovod, Imagenet data | PyTorch 2.0.0+cu117 | mahti | 8 | 2023-09-15 | 5230.77 | +| DDP, synthetic | PyTorch 2.0.0+cu117 | puhti | 1 | 2023-09-16 | 331.39 | +| DDP, synthetic | PyTorch 2.0.0+cu117 | puhti | 4 | 2023-09-16 | 1245.59 | +| DDP, synthetic | PyTorch 2.0.0+cu117 | puhti | 8 | 2023-09-16 | 2473.86 | +| DDP, synthetic, fp16 | PyTorch 2.0.0+cu117 | puhti | 1 | 2023-09-16 | 674.17 | +| DDP, synthetic, fp16 | PyTorch 2.0.0+cu117 | puhti | 4 | 2023-09-16 | 2389.34 | +| DDP, synthetic, fp16 | PyTorch 2.0.0+cu117 | puhti | 8 | 2023-09-16 | 4644.40 | +| DDP Lightning, synthetic | PyTorch 2.0.0+cu117 | puhti | 1 | 2023-09-16 | 331.98 | +| DDP Lightning, synthetic | PyTorch 2.0.0+cu117 | puhti | 4 | 2023-09-16 | 1254.01 | +| DDP Lightning, synthetic | PyTorch 2.0.0+cu117 | puhti | 8 | 2023-09-16 | 2488.49 | +| DDP, Imagenet data | PyTorch 2.0.0+cu117 | puhti | 1 | 2023-09-16 | 329.76 | +| DDP, Imagenet data | PyTorch 2.0.0+cu117 | puhti | 4 | 2023-09-16 | 1244.49 | +| DDP, Imagenet data | PyTorch 2.0.0+cu117 | puhti | 8 | 2023-09-16 | 2470.56 | +| DeepSpeed, synthetic data | PyTorch 2.0.0+cu117 | puhti | 4 | 2023-09-16 | 1262.18 | +| DeepSpeed, synthetic data | PyTorch 2.0.0+cu117 | puhti | 8 | 2023-09-16 | 2429.24 | +| Horovod, synthetic | PyTorch 2.0.0+cu117 | puhti | 8 | 2023-09-16 | 2314.87 | +| Horovod, Imagenet data | PyTorch 2.0.0+cu117 | puhti | 8 | 2023-09-16 | 2313.93 |