From b79f1d2f7599fbae15cde13c18fa76e9525833f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rui=20Ap=C3=B3stolo?= Date: Mon, 9 Sep 2024 11:52:26 +0100 Subject: [PATCH 01/19] Config changes --- configuration/archer2.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/configuration/archer2.py b/configuration/archer2.py index 1ec2203..6549d9b 100644 --- a/configuration/archer2.py +++ b/configuration/archer2.py @@ -122,6 +122,7 @@ def command(self, job): "craype-accel-amd-gfx90a", "craype-x86-milan", ], + "features": ["gpu"], "cc": "cc", "cxx": "CC", "ftn": "ftn", @@ -130,6 +131,7 @@ def command(self, job): { "name": "rocm-PrgEnv-cray", "modules": ["PrgEnv-cray"], + "features": ["gpu"], "cc": "cc", "cxx": "CC", "ftn": "ftn", @@ -138,6 +140,7 @@ def command(self, job): { "name": "rocm-PrgEnv-aocc", "modules": ["PrgEnv-aocc"], + "features": ["gpu"], "cc": "cc", "cxx": "CC", "ftn": "ftn", From f183deabc12c9f6c06f5ac653d0ad642f1409ad1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rui=20Ap=C3=B3stolo?= Date: Mon, 9 Sep 2024 11:54:35 +0100 Subject: [PATCH 02/19] Changed ARCHER2 gpu partition name in hello.py --- tests/compile/hello/hello.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/compile/hello/hello.py b/tests/compile/hello/hello.py index 2d29921..f70f7b1 100644 --- a/tests/compile/hello/hello.py +++ b/tests/compile/hello/hello.py @@ -1,4 +1,5 @@ """Compilation tests""" + import reframe as rfm import reframe.utility.sanity as sn @@ -45,3 +46,8 @@ class HelloTestGPU(HelloTestBase): } num_tasks = None num_cpus_per_task = None + + @run_after("setup") + def setup_gpu_options(self): + """Change qos for ARCHER2""" + self.extra_resources["qos"]["qos"] = "gpu-shd" From 75cc204fb3a7743e54c43048f1eaabcfd765d64b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rui=20Ap=C3=B3stolo?= Date: Mon, 9 Sep 2024 12:50:51 +0100 Subject: [PATCH 03/19] Added job limits to compute-gpu and compute-gpu-torch --- configuration/archer2.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/configuration/archer2.py b/configuration/archer2.py index 6549d9b..f7b1c98 100644 --- a/configuration/archer2.py +++ b/configuration/archer2.py @@ -49,6 +49,7 @@ def command(self, job): { "name": "compute-gpu", "descr": "Compute nodes with AMD GPUs", + "max_jobs": 1, "features": ["gpu"], "scheduler": "slurm", "launcher": "srun", @@ -69,6 +70,7 @@ def command(self, job): { "name": "compute-gpu-torch", "descr": "Compute nodes with AMD GPUs", + "max_jobs": 1, "features": ["gpu"], "scheduler": "slurm", "launcher": "torchrun", From 4f05ab5a5245c697ecf37c368d1b92ffd7e92057 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rui=20Ap=C3=B3stolo?= Date: Mon, 9 Sep 2024 12:54:25 +0100 Subject: [PATCH 04/19] Changed ARCHER2 torch tag --- configuration/archer2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configuration/archer2.py b/configuration/archer2.py index f7b1c98..ae73dc3 100644 --- a/configuration/archer2.py +++ b/configuration/archer2.py @@ -49,7 +49,7 @@ def command(self, job): { "name": "compute-gpu", "descr": "Compute nodes with AMD GPUs", - "max_jobs": 1, + "max_jobs": 2, "features": ["gpu"], "scheduler": "slurm", "launcher": "srun", @@ -70,8 +70,8 @@ def command(self, job): { "name": "compute-gpu-torch", "descr": "Compute nodes with AMD GPUs", - "max_jobs": 1, - "features": ["gpu"], + "max_jobs": 2, + "features": ["torch"], "scheduler": "slurm", "launcher": "torchrun", "access": ["--partition=gpu"], From 58807141a661d71db163b50455a4e59a794a9d39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rui=20Ap=C3=B3stolo?= Date: Mon, 9 Sep 2024 15:18:24 +0100 Subject: [PATCH 05/19] Configuration changes --- configuration/archer2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configuration/archer2.py b/configuration/archer2.py index ae73dc3..783c205 100644 --- a/configuration/archer2.py +++ b/configuration/archer2.py @@ -39,7 +39,7 @@ def command(self, job): "--qos=standard", ], "environs": ["PrgEnv-gnu", "PrgEnv-cray", "PrgEnv-aocc"], - "max_jobs": 16, + "max_jobs": 64, "processor": { "num_cpus": 128, "num_cpus_per_socket": 64, @@ -69,7 +69,7 @@ def command(self, job): }, { "name": "compute-gpu-torch", - "descr": "Compute nodes with AMD GPUs", + "descr": "Compute nodes with AMD GPUs, and torch launcher", "max_jobs": 2, "features": ["torch"], "scheduler": "slurm", From 0f1600cd61e3f75ab73929f1721faed67ca4617d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rui=20Ap=C3=B3stolo?= Date: Mon, 9 Sep 2024 15:19:24 +0100 Subject: [PATCH 06/19] Changed valid_system --- tests/compile/hello/hello.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/compile/hello/hello.py b/tests/compile/hello/hello.py index f70f7b1..5a56571 100644 --- a/tests/compile/hello/hello.py +++ b/tests/compile/hello/hello.py @@ -38,7 +38,7 @@ class HelloTestCPU(HelloTestBase): class HelloTestGPU(HelloTestBase): """GPU systems test class""" - valid_systems = ["+gpu"] + valid_systems = ["-torch"] valid_prog_environs = ["+gpu"] extra_resources = { "qos": {"qos": "gpu"}, From 53fc28c4cf06e06a4b281942f5c74260e732dd7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rui=20Ap=C3=B3stolo?= Date: Mon, 9 Sep 2024 15:27:02 +0100 Subject: [PATCH 07/19] Removed cirrus:compute-gpu-default --- configuration/cirrus.py | 26 ++------------------------ tests/mlperf/cosmoflow/gpu.py | 2 +- tests/mlperf/deepcam/gpu.py | 2 +- tests/mlperf/resnet50/gpu.py | 2 +- 4 files changed, 5 insertions(+), 27 deletions(-) diff --git a/configuration/cirrus.py b/configuration/cirrus.py index 3fa1475..3066089 100644 --- a/configuration/cirrus.py +++ b/configuration/cirrus.py @@ -50,7 +50,7 @@ "--partition=highmem", ], "max_jobs": 16, - "environs": ["gnu", "intel"], + "environs": ["gcc", "intel"], "resources": [ { "name": "qos", @@ -73,7 +73,7 @@ "--partition=gpu", ], "max_jobs": 4, - "environs": ["nvidia-mpi"], + "environs": ["Default", "nvidia-mpi"], "resources": [ {"name": "qos", "options": ["--qos={qos}"]}, { @@ -90,28 +90,6 @@ {"type": "gpu", "num_devices": 4} ] }, - { - "name": "compute-gpu-default", - "descr": "Compute nodes with GPUs but doesn't load nvcc compilers or mpi", - "scheduler": "slurm", - "launcher": "srun", - "access": [ - "--partition=gpu", - ], - "max_jobs": 4, - "environs": ["Default"], - "resources": [ - {"name": "qos", "options": ["--qos={qos}"]}, - ], - "processor": { - "num_cpus": 40, - "num_cpus_per_socket": 20, - "num_sockets": 2, - }, - "devices": [ - {"type": "gpu", "num_devices": 4} - ] - }, ], } ], diff --git a/tests/mlperf/cosmoflow/gpu.py b/tests/mlperf/cosmoflow/gpu.py index ba39651..b447f8f 100644 --- a/tests/mlperf/cosmoflow/gpu.py +++ b/tests/mlperf/cosmoflow/gpu.py @@ -12,7 +12,7 @@ class CosmoFlowGPUBenchmark(CosmoFlowBaseCheck): """Cosmoflow GPU benchmark""" valid_prog_environs = ["Default", "rocm-PrgEnv-gnu"] - valid_systems = ["cirrus:compute-gpu-default", "archer2:compute-gpu-torch"] + valid_systems = ["cirrus:compute-gpu", "archer2:compute-gpu-torch"] descr = "CosmoFlow GPU Benchmark" num_tasks = None diff --git a/tests/mlperf/deepcam/gpu.py b/tests/mlperf/deepcam/gpu.py index 574921a..d4cd288 100644 --- a/tests/mlperf/deepcam/gpu.py +++ b/tests/mlperf/deepcam/gpu.py @@ -12,7 +12,7 @@ class DeepCamGPUBenchmark(DeepCamBaseCheck): """Class for deepcam tests on gpus""" valid_prog_environs = ["Default", "rocm-PrgEnv-gnu"] - valid_systems = ["cirrus:compute-gpu-default", "archer2:compute-gpu-torch"] + valid_systems = ["cirrus:compute-gpu", "archer2:compute-gpu-torch"] descr = "Deepcam GPU Benchmark" num_tasks = None diff --git a/tests/mlperf/resnet50/gpu.py b/tests/mlperf/resnet50/gpu.py index f5c27ed..76472a6 100644 --- a/tests/mlperf/resnet50/gpu.py +++ b/tests/mlperf/resnet50/gpu.py @@ -12,7 +12,7 @@ class ResNet50GPUBenchmark(ResNet50BaseCheck): """Restnet50 test class for GPUs""" valid_prog_environs = ["Default", "rocm-PrgEnv-gnu"] - valid_systems = ["cirrus:compute-gpu-default", "archer2:compute-gpu-torch"] + valid_systems = ["cirrus:compute-gpu", "archer2:compute-gpu-torch"] descr = "ResNet50 GPU Benchmark" num_tasks = None From 97102782ee7b35e2532fc7ec927149910622ab2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rui=20Ap=C3=B3stolo?= Date: Mon, 9 Sep 2024 15:37:18 +0100 Subject: [PATCH 08/19] Cirrus config changes --- configuration/cirrus.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/configuration/cirrus.py b/configuration/cirrus.py index 3066089..7d5327c 100644 --- a/configuration/cirrus.py +++ b/configuration/cirrus.py @@ -122,6 +122,8 @@ { "name": "Default", "cc": "gcc", + "cxx": "gcc", + "ftn": "gfortran", "target_systems": ["cirrus"], }, ], From 2b388a32f25381a41f33ce15e9b20946517c0a4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rui=20Ap=C3=B3stolo?= Date: Mon, 9 Sep 2024 15:37:48 +0100 Subject: [PATCH 09/19] Change in valid_system --- tests/compile/hello/hello.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/compile/hello/hello.py b/tests/compile/hello/hello.py index 5a56571..6d37282 100644 --- a/tests/compile/hello/hello.py +++ b/tests/compile/hello/hello.py @@ -27,7 +27,7 @@ def assert_finished(self): class HelloTestCPU(HelloTestBase): """CPU systems test class""" - valid_systems = ["*"] + valid_systems = ["-gpu"] valid_prog_environs = ["-gpu"] extra_resources = { "qos": {"qos": "standard"}, From 07329f8aa6d271c5bcc60483255d03f2e6173d10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rui=20Ap=C3=B3stolo?= Date: Mon, 9 Sep 2024 15:39:46 +0100 Subject: [PATCH 10/19] Fixed gpu qos change if running in A2 --- tests/compile/hello/hello.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/compile/hello/hello.py b/tests/compile/hello/hello.py index 6d37282..72a4c1e 100644 --- a/tests/compile/hello/hello.py +++ b/tests/compile/hello/hello.py @@ -50,4 +50,5 @@ class HelloTestGPU(HelloTestBase): @run_after("setup") def setup_gpu_options(self): """Change qos for ARCHER2""" - self.extra_resources["qos"]["qos"] = "gpu-shd" + if self.current_system.name in ["archer2"]: + self.extra_resources["qos"]["qos"] = "gpu-shd" From 655c0cf782bad49934bc92f3a8dffdb9ef2c579b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rui=20Ap=C3=B3stolo?= Date: Mon, 9 Sep 2024 16:11:32 +0100 Subject: [PATCH 11/19] Cirrus config changes --- configuration/cirrus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configuration/cirrus.py b/configuration/cirrus.py index 7d5327c..b1ea2f5 100644 --- a/configuration/cirrus.py +++ b/configuration/cirrus.py @@ -49,7 +49,7 @@ "--distribution=block:block", "--partition=highmem", ], - "max_jobs": 16, + "max_jobs": 2, "environs": ["gcc", "intel"], "resources": [ { @@ -121,8 +121,8 @@ }, { "name": "Default", + "features": ["default"], "cc": "gcc", - "cxx": "gcc", "ftn": "gfortran", "target_systems": ["cirrus"], }, From ad9519d5707b9ab8e8e5aaa9e4232a81963f7aa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rui=20Ap=C3=B3stolo?= Date: Mon, 9 Sep 2024 16:11:58 +0100 Subject: [PATCH 12/19] Removed default partition from hello_world test --- tests/compile/hello/hello.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/compile/hello/hello.py b/tests/compile/hello/hello.py index 72a4c1e..7ee9f68 100644 --- a/tests/compile/hello/hello.py +++ b/tests/compile/hello/hello.py @@ -28,7 +28,7 @@ class HelloTestCPU(HelloTestBase): """CPU systems test class""" valid_systems = ["-gpu"] - valid_prog_environs = ["-gpu"] + valid_prog_environs = ["-gpu -default"] extra_resources = { "qos": {"qos": "standard"}, } From 443ef437d46a3ab1e8164ebbb2951ba6096f7d9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rui=20Ap=C3=B3stolo?= Date: Tue, 24 Sep 2024 10:30:08 +0100 Subject: [PATCH 13/19] config typo fix --- configuration/cirrus.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/configuration/cirrus.py b/configuration/cirrus.py index 35fee23..4dcdcd3 100644 --- a/configuration/cirrus.py +++ b/configuration/cirrus.py @@ -88,30 +88,6 @@ }, "devices": [{"type": "gpu", "num_devices": 4}], }, -<<<<<<< HEAD -||||||| 09644b2 - { - "name": "compute-gpu-default", - "descr": "Compute nodes with GPUs but doesn't load nvcc compilers or mpi", - "scheduler": "slurm", - "launcher": "srun", - "access": [ - "--partition=gpu", - ], - "max_jobs": 4, - "environs": ["Default"], - "resources": [ - {"name": "qos", "options": ["--qos={qos}"]}, - ], - "processor": { - "num_cpus": 40, - "num_cpus_per_socket": 20, - "num_sockets": 2, - }, - "devices": [ - {"type": "gpu", "num_devices": 4} - ] - }, ], } ], From 614aa8925b2035c0762ef42d0afe868cbc0baae8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rui=20Ap=C3=B3stolo?= Date: Tue, 24 Sep 2024 11:04:13 +0100 Subject: [PATCH 14/19] openmpi version --- tests/mlperf/cosmoflow/gpu.py | 2 +- tests/mlperf/deepcam/gpu.py | 2 +- tests/mlperf/resnet50/gpu.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/mlperf/cosmoflow/gpu.py b/tests/mlperf/cosmoflow/gpu.py index b447f8f..4518b6a 100644 --- a/tests/mlperf/cosmoflow/gpu.py +++ b/tests/mlperf/cosmoflow/gpu.py @@ -58,7 +58,7 @@ def setup_systems(self): self.extra_resources = { "qos": {"qos": "gpu"}, } - self.modules = ["openmpi/4.1.5-cuda-11.6"] + self.modules = ["openmpi/4.1.6-cuda-11.6"] self.prerun_cmds = [ 'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"', "conda activate mlperf_torch", diff --git a/tests/mlperf/deepcam/gpu.py b/tests/mlperf/deepcam/gpu.py index d4cd288..30ceecd 100644 --- a/tests/mlperf/deepcam/gpu.py +++ b/tests/mlperf/deepcam/gpu.py @@ -58,7 +58,7 @@ def setup_systems(self): self.extra_resources = { "qos": {"qos": "gpu"}, } - self.modules = ["openmpi/4.1.5-cuda-11.6"] + self.modules = ["openmpi/4.1.6-cuda-11.6"] self.prerun_cmds = [ 'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"', "conda activate mlperf_torch", diff --git a/tests/mlperf/resnet50/gpu.py b/tests/mlperf/resnet50/gpu.py index 76472a6..21ba687 100644 --- a/tests/mlperf/resnet50/gpu.py +++ b/tests/mlperf/resnet50/gpu.py @@ -63,7 +63,7 @@ def setup_systems(self): self.extra_resources = { "qos": {"qos": "gpu"}, } - self.modules = ["openmpi/4.1.5-cuda-11.6"] + self.modules = ["openmpi/4.1.6-cuda-11.6"] self.prerun_cmds = [ 'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"', "conda activate mlperf_torch", From e567a299bb7e26403d263f112050fcd266b36b1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rui=20Ap=C3=B3stolo?= Date: Tue, 24 Sep 2024 11:21:16 +0100 Subject: [PATCH 15/19] Fix lammps test --- tests/apps/lammps/ethanol.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/apps/lammps/ethanol.py b/tests/apps/lammps/ethanol.py index 63960e6..9f8f413 100644 --- a/tests/apps/lammps/ethanol.py +++ b/tests/apps/lammps/ethanol.py @@ -94,13 +94,13 @@ def setup_nnodes(self): elif self.current_system.name in ["cirrus"]: self.executable_opts = LAMMPSBaseEthanol.executable_opts + ["-sf gpu -pk gpu 4"] self.extra_resources["qos"] = {"qos": "short"} - self.num_tasks_per_node = 40 + # self.num_tasks_per_node = 40 @run_after("setup") def setup_gpu_options(self): """sets up different resources for gpu systems""" self.env_vars["PARAMS"] = ( - f'"--exclusive --ntasks={self.num_tasks_per_node} --tasks-per-node={self.num_tasks_per_node}"' + f'"--exclusive --ntasks=40 --tasks-per-node=40"' ) # Cirru slurm demands it be done this way. # Trying to add $PARAMS directly to job.launcher.options fails. From 058ba46481379291bc0551c17435c923b3ba2feb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rui=20Ap=C3=B3stolo?= Date: Tue, 24 Sep 2024 13:58:36 +0100 Subject: [PATCH 16/19] Lammps/ethanol linting --- tests/apps/lammps/ethanol.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/apps/lammps/ethanol.py b/tests/apps/lammps/ethanol.py index 9f8f413..60a9a15 100644 --- a/tests/apps/lammps/ethanol.py +++ b/tests/apps/lammps/ethanol.py @@ -99,9 +99,7 @@ def setup_nnodes(self): @run_after("setup") def setup_gpu_options(self): """sets up different resources for gpu systems""" - self.env_vars["PARAMS"] = ( - f'"--exclusive --ntasks=40 --tasks-per-node=40"' - ) + self.env_vars["PARAMS"] = "--exclusive --ntasks=40 --tasks-per-node=40" # Cirru slurm demands it be done this way. # Trying to add $PARAMS directly to job.launcher.options fails. if self.current_system.name in ["cirrus"]: From 49d7188aa85b519ecf1f9ae0bf988d5ad2de36bd Mon Sep 17 00:00:00 2001 From: user name Date: Tue, 24 Sep 2024 16:31:59 +0100 Subject: [PATCH 17/19] Updates envs and configs for MLPerf tests on Cirrus --- tests/mlperf/cosmoflow/gpu.py | 45 +++++++++++++++++-------- tests/mlperf/deepcam/gpu.py | 41 +++++++++++++++-------- tests/mlperf/resnet50/gpu.py | 54 +++++++++++++++++++----------- tests/mlperf/resnet50/graphcore.py | 2 +- 4 files changed, 94 insertions(+), 48 deletions(-) diff --git a/tests/mlperf/cosmoflow/gpu.py b/tests/mlperf/cosmoflow/gpu.py index 4518b6a..56d38a1 100644 --- a/tests/mlperf/cosmoflow/gpu.py +++ b/tests/mlperf/cosmoflow/gpu.py @@ -17,7 +17,9 @@ class CosmoFlowGPUBenchmark(CosmoFlowBaseCheck): num_tasks = None num_gpus = parameter([4]) # parameter(1 << pow for pow in range(7)) - lbs = parameter([8]) + # Due to memory, Cirrus is limited to a lbs of 2 + lbs = parameter([2]) + time_limit = "1h" num_nodes = 1 @@ -25,17 +27,6 @@ class CosmoFlowGPUBenchmark(CosmoFlowBaseCheck): @run_after("init") def setup_systems(self): """Setup environment""" - self.executable_opts = [ - "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py", - "--config", - "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/archer2_config.yaml", - "--device", - "cuda", - "-lbs", - f"{self.lbs}", - # "--t_subset_size", "2048", - # "--v_subset_size", "512" - ] if self.current_system.name in ["archer2"]: self.executable = "" self.extra_resources = { @@ -52,6 +43,18 @@ def setup_systems(self): "LD_PRELOAD": "$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so:$LD_PRELOAD", "HOME": "$PWD", } + self.executable_opts = [ + "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py", + "--config", + "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/archer2_config.yaml", + "--device", + "cuda", + "-lbs", + "8", + # "--t_subset_size", "2048", + # "--v_subset_size", "512" + ] + elif self.current_system.name in ["cirrus"]: self.executable = "python" @@ -60,14 +63,28 @@ def setup_systems(self): } self.modules = ["openmpi/4.1.6-cuda-11.6"] self.prerun_cmds = [ - 'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"', - "conda activate mlperf_torch", + 'eval "$(/work/z04/shared/ebroadwa/miniconda3/bin/conda shell.bash hook)"', + "conda activate torch_mlperf", ] self.env_vars = { "OMP_NUM_THREADS": "5", "SRUN_CPUS_PER_TASK": "5", "OMPI_MCA_mpi_warn_on_fork": "0", } + self.executable_opts = [ + "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py", + "--config", + "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/cirrus_config.yaml", + "--device", + "cuda", + "--data-dir", + "/work/z04/shared/mlperf-hpc/cosmoflow/mini/cosmoUniverse_2019_05_4parE_tf_v2_mini " + "-lbs", + "2", + # "--t_subset_size", "2048", + # "--v_subset_size", "512" + ] + @run_before("run") def set_task_distribution(self): diff --git a/tests/mlperf/deepcam/gpu.py b/tests/mlperf/deepcam/gpu.py index 30ceecd..39a917a 100644 --- a/tests/mlperf/deepcam/gpu.py +++ b/tests/mlperf/deepcam/gpu.py @@ -17,7 +17,8 @@ class DeepCamGPUBenchmark(DeepCamBaseCheck): num_tasks = None num_gpus = parameter([4]) # parameter(1 << pow for pow in range(7)) - lbs = parameter([8]) + # Due to memory, Cirrus is limited to a lbs of 2 + # lbs = parameter([2]) time_limit = "1h" num_nodes = 1 @@ -25,17 +26,6 @@ class DeepCamGPUBenchmark(DeepCamBaseCheck): @run_after("init") def setup_systems(self): """Setup environment""" - self.executable_opts = [ - "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py", - "--config", - "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/archer2benchmark_config.yaml", - "--device", - "cuda", - "-lbs", - f"{self.lbs}", - # "--t_subset_size", "1024", - # "--v_subset_size", "512" - ] if self.current_system.name in ["archer2"]: self.executable = "" self.extra_resources = { @@ -52,6 +42,18 @@ def setup_systems(self): "LD_PRELOAD": "$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so:$LD_PRELOAD", "HOME": "$PWD", } + self.executable_opts = [ + "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py", + "--config", + "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/archer2benchmark_config.yaml", + "--device", + "cuda", + "-lbs", + "8", + # "--t_subset_size", "1024", + # "--v_subset_size", "512" + ] + elif self.current_system.name in ["cirrus"]: self.executable = "python" @@ -60,14 +62,25 @@ def setup_systems(self): } self.modules = ["openmpi/4.1.6-cuda-11.6"] self.prerun_cmds = [ - 'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"', - "conda activate mlperf_torch", + 'eval "$(/work/z04/shared/ebroadwa/miniconda3/bin/conda shell.bash hook)"', + "conda activate torch_mlperf", ] self.env_vars = { "OMP_NUM_THREADS": "5", "SRUN_CPUS_PER_TASK": "5", "OMPI_MCA_mpi_warn_on_fork": "0", } + self.executable_opts = [ + "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py", + "--config", + "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/cirrusbenchmark_config.yaml", + "--device", + "cuda", + "-lbs", + "2", + # "--t_subset_size", "1024", + # "--v_subset_size", "512" + ] @run_before("run") def set_task_distribution(self): diff --git a/tests/mlperf/resnet50/gpu.py b/tests/mlperf/resnet50/gpu.py index 21ba687..7e64bbd 100644 --- a/tests/mlperf/resnet50/gpu.py +++ b/tests/mlperf/resnet50/gpu.py @@ -17,7 +17,8 @@ class ResNet50GPUBenchmark(ResNet50BaseCheck): num_tasks = None num_gpus = parameter([4]) - lbs = parameter([8]) + # Due to memory, Cirrus is limited to a lbs of 2 + # lbs = parameter([8]) time_limit = "1h" num_nodes = 1 @@ -25,19 +26,6 @@ class ResNet50GPUBenchmark(ResNet50BaseCheck): @run_after("init") def setup_systems(self): """Environment setup""" - self.executable_opts = [ - "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/train.py", - "--config", - "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/archer2benchmark_config.yaml", - "--device", - "cuda", - "-lbs", - f"{self.lbs}", - "--t_subset_size", - "2048", - "--v_subset_size", - "512", - ] if self.current_system.name in ["archer2"]: self.executable = "" self.extra_resources = { @@ -54,25 +42,53 @@ def setup_systems(self): "LD_PRELOAD": "$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so:$LD_PRELOAD", "HOME": "$PWD", } + self.executable_opts = [ + "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/train.py", + "--config", + "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/archer2benchmark_config.yaml", + "--device", + "cuda", + "-lbs", + "8", + "--t_subset_size", + "2048", + "--v_subset_size", + "512", + ] + elif self.current_system.name in ["cirrus"]: - self.executable_opts[2] = ( - "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/cirrusbenchmark_config.yaml", - ) + # self.executable_opts[2] = ( + # "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/cirrusbenchmark_config.yaml", + # ) self.executable = "python" self.extra_resources = { "qos": {"qos": "gpu"}, } self.modules = ["openmpi/4.1.6-cuda-11.6"] self.prerun_cmds = [ - 'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"', - "conda activate mlperf_torch", + 'eval "$(/work/z04/shared/ebroadwa/miniconda3/bin/conda shell.bash hook)"', + "conda activate torch_mlperf", ] self.env_vars = { "OMP_NUM_THREADS": "5", "SRUN_CPUS_PER_TASK": "5", "OMPI_MCA_mpi_warn_on_fork": "0", } + self.executable_opts = [ + "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/train.py", + "--config", + "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/cirrusbenchmark_config.yaml", + "--device", + "cuda", + "-lbs", + "2", + "--t_subset_size", + "2048", + "--v_subset_size", + "512", + ] + @run_before("run") def set_task_distribution(self): diff --git a/tests/mlperf/resnet50/graphcore.py b/tests/mlperf/resnet50/graphcore.py index a77b62a..58fc8a0 100644 --- a/tests/mlperf/resnet50/graphcore.py +++ b/tests/mlperf/resnet50/graphcore.py @@ -8,7 +8,7 @@ @rfm.simple_test -class ResNetGPUServiceBenchmark(ResNet50BaseCheck): +class ResNetGPUServiceGraphCoreBenchmark(ResNet50BaseCheck): """Resnet50 test class for graphcore""" valid_prog_environs = ["*"] From 654fc5b9592cdef989ad95c1bc5671c0d03b9842 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rui=20Ap=C3=B3stolo?= Date: Wed, 25 Sep 2024 14:49:39 +0100 Subject: [PATCH 18/19] Linting --- tests/mlperf/cosmoflow/gpu.py | 8 ++------ tests/mlperf/deepcam/gpu.py | 3 --- tests/mlperf/resnet50/gpu.py | 2 -- 3 files changed, 2 insertions(+), 11 deletions(-) diff --git a/tests/mlperf/cosmoflow/gpu.py b/tests/mlperf/cosmoflow/gpu.py index 56d38a1..1db3b70 100644 --- a/tests/mlperf/cosmoflow/gpu.py +++ b/tests/mlperf/cosmoflow/gpu.py @@ -20,7 +20,6 @@ class CosmoFlowGPUBenchmark(CosmoFlowBaseCheck): # Due to memory, Cirrus is limited to a lbs of 2 lbs = parameter([2]) - time_limit = "1h" num_nodes = 1 @@ -55,7 +54,6 @@ def setup_systems(self): # "--v_subset_size", "512" ] - elif self.current_system.name in ["cirrus"]: self.executable = "python" self.extra_resources = { @@ -77,15 +75,13 @@ def setup_systems(self): "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/cirrus_config.yaml", "--device", "cuda", - "--data-dir", - "/work/z04/shared/mlperf-hpc/cosmoflow/mini/cosmoUniverse_2019_05_4parE_tf_v2_mini " - "-lbs", + "--data-dir", + "/work/z04/shared/mlperf-hpc/cosmoflow/mini/cosmoUniverse_2019_05_4parE_tf_v2_mini " "-lbs", "2", # "--t_subset_size", "2048", # "--v_subset_size", "512" ] - @run_before("run") def set_task_distribution(self): """Setup task distribution""" diff --git a/tests/mlperf/deepcam/gpu.py b/tests/mlperf/deepcam/gpu.py index 39a917a..6c5f3ef 100644 --- a/tests/mlperf/deepcam/gpu.py +++ b/tests/mlperf/deepcam/gpu.py @@ -54,7 +54,6 @@ def setup_systems(self): # "--v_subset_size", "512" ] - elif self.current_system.name in ["cirrus"]: self.executable = "python" self.extra_resources = { @@ -109,5 +108,3 @@ def setup_gpu_options(self): self.job.launcher.options.append( f"--ntasks={self.num_gpus} --tasks-per-node={self.num_gpus if self.num_gpus <= 4 else 4}" ) - - # ---------------------------------------------------------------------------- diff --git a/tests/mlperf/resnet50/gpu.py b/tests/mlperf/resnet50/gpu.py index 7e64bbd..3dba5f5 100644 --- a/tests/mlperf/resnet50/gpu.py +++ b/tests/mlperf/resnet50/gpu.py @@ -56,7 +56,6 @@ def setup_systems(self): "512", ] - elif self.current_system.name in ["cirrus"]: # self.executable_opts[2] = ( # "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/cirrusbenchmark_config.yaml", @@ -89,7 +88,6 @@ def setup_systems(self): "512", ] - @run_before("run") def set_task_distribution(self): """Setup task distribution""" From 0c5cae9eb75870f52c7779a97fa270d440b06ce7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rui=20Ap=C3=B3stolo?= Date: Wed, 25 Sep 2024 14:53:38 +0100 Subject: [PATCH 19/19] Linting part 2: electric boogaloo --- tests/mlperf/cosmoflow/gpu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/mlperf/cosmoflow/gpu.py b/tests/mlperf/cosmoflow/gpu.py index 1db3b70..279ac80 100644 --- a/tests/mlperf/cosmoflow/gpu.py +++ b/tests/mlperf/cosmoflow/gpu.py @@ -76,7 +76,8 @@ def setup_systems(self): "--device", "cuda", "--data-dir", - "/work/z04/shared/mlperf-hpc/cosmoflow/mini/cosmoUniverse_2019_05_4parE_tf_v2_mini " "-lbs", + "/work/z04/shared/mlperf-hpc/cosmoflow/mini/cosmoUniverse_2019_05_4parE_tf_v2_mini ", + "-lbs", "2", # "--t_subset_size", "2048", # "--v_subset_size", "512"