From c6b5c361ee32fe6bcffc3ca80dd1590363f8bd8c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mats=20Sj=C3=B6berg?=
Date: Tue, 6 Jun 2023 10:55:59 +0300
Subject: [PATCH] Some LUMI fixes

---
 lumi-memory-bug.py | 17 +++++++++++++++++
 pytorch-ddp.sh     | 10 +---------
 2 files changed, 18 insertions(+), 9 deletions(-)
 create mode 100644 lumi-memory-bug.py

diff --git a/lumi-memory-bug.py b/lumi-memory-bug.py
new file mode 100644
index 0000000..d1bb63c
--- /dev/null
+++ b/lumi-memory-bug.py
@@ -0,0 +1,17 @@
+import torch
+import torchvision
+import sys
+
+def main(bs):
+    batch = torch.rand((bs, 3, 32, 32)).cuda()
+    model = torchvision.models.resnet18().cuda()
+    print(f'Feeding batch with size {bs} to model..')
+    model(batch)
+    print('Done.')
+
+if __name__ == '__main__':
+    bs = 256
+    if len(sys.argv) > 1:
+        bs = int(sys.argv[1])
+    main(bs)
+
diff --git a/pytorch-ddp.sh b/pytorch-ddp.sh
index fd3248d..bd8471e 100644
--- a/pytorch-ddp.sh
+++ b/pytorch-ddp.sh
@@ -5,15 +5,7 @@ SCRIPT="benchmarks/pytorch_visionmodel_ddp.py"
 IMAGENET_DATA=/scratch/dac/data/ilsvrc2012-torch-resized-new.tar
 
 DIST_OPTS="--standalone --master_port 0"
-SCRIPT_OPTS="--warmup-steps 100"
-
-#if [ "$LMOD_FAMILY_PYTHON_ML_ENV" != "pytorch" ]
-#then
-#    echo "WARNING: no pytorch module loaded, loading default module"
-#    module load pytorch
-#fi
-
-which python3
+SCRIPT_OPTS="--warmup-steps 100 --workers=$SLURM_CPUS_PER_TASK"
 
 if [ "$SLURM_NTASKS" -ne "$SLURM_NNODES" ]; then
     echo "ERROR: this script needs to be run as one task per node."