Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changes to configurations and fixes to several tests #48

Merged
merged 20 commits into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions configuration/archer2.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def command(self, job):
"--qos=standard",
],
"environs": ["PrgEnv-gnu", "PrgEnv-cray", "PrgEnv-aocc"],
"max_jobs": 16,
"max_jobs": 64,
"processor": {
"num_cpus": 128,
"num_cpus_per_socket": 64,
Expand All @@ -49,6 +49,7 @@ def command(self, job):
{
"name": "compute-gpu",
"descr": "Compute nodes with AMD GPUs",
"max_jobs": 2,
"features": ["gpu"],
"scheduler": "slurm",
"launcher": "srun",
Expand All @@ -68,8 +69,9 @@ def command(self, job):
},
{
"name": "compute-gpu-torch",
"descr": "Compute nodes with AMD GPUs",
"features": ["gpu"],
"descr": "Compute nodes with AMD GPUs, and torch launcher",
"max_jobs": 2,
"features": ["torch"],
"scheduler": "slurm",
"launcher": "torchrun",
"access": ["--partition=gpu"],
Expand Down Expand Up @@ -122,6 +124,7 @@ def command(self, job):
"craype-accel-amd-gfx90a",
"craype-x86-milan",
],
"features": ["gpu"],
"cc": "cc",
"cxx": "CC",
"ftn": "ftn",
Expand All @@ -130,6 +133,7 @@ def command(self, job):
{
"name": "rocm-PrgEnv-cray",
"modules": ["PrgEnv-cray"],
"features": ["gpu"],
"cc": "cc",
"cxx": "CC",
"ftn": "ftn",
Expand All @@ -138,6 +142,7 @@ def command(self, job):
{
"name": "rocm-PrgEnv-aocc",
"modules": ["PrgEnv-aocc"],
"features": ["gpu"],
"cc": "cc",
"cxx": "CC",
"ftn": "ftn",
Expand Down
25 changes: 3 additions & 22 deletions configuration/cirrus.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
"--distribution=block:block",
"--partition=highmem",
],
"max_jobs": 16,
"max_jobs": 2,
"environs": ["gcc", "intel"],
"resources": [
{
Expand All @@ -73,7 +73,7 @@
"--partition=gpu",
],
"max_jobs": 4,
"environs": ["nvidia-mpi"],
"environs": ["Default", "nvidia-mpi"],
"resources": [
{"name": "qos", "options": ["--qos={qos}"]},
{
Expand All @@ -88,26 +88,6 @@
},
"devices": [{"type": "gpu", "num_devices": 4}],
},
{
"name": "compute-gpu-default",
"descr": "Compute nodes with GPUs but doesn't load nvcc compilers or mpi",
"scheduler": "slurm",
"launcher": "srun",
"access": [
"--partition=gpu",
],
"max_jobs": 4,
"environs": ["Default"],
"resources": [
{"name": "qos", "options": ["--qos={qos}"]},
],
"processor": {
"num_cpus": 40,
"num_cpus_per_socket": 20,
"num_sockets": 2,
},
"devices": [{"type": "gpu", "num_devices": 4}],
},
],
}
],
Expand Down Expand Up @@ -139,6 +119,7 @@
},
{
"name": "Default",
"features": ["default"],
"cc": "gcc",
"ftn": "gfortran",
"target_systems": ["cirrus"],
Expand Down
6 changes: 2 additions & 4 deletions tests/apps/lammps/ethanol.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,14 +94,12 @@ def setup_nnodes(self):
elif self.current_system.name in ["cirrus"]:
self.executable_opts = LAMMPSBaseEthanol.executable_opts + ["-sf gpu -pk gpu 4"]
self.extra_resources["qos"] = {"qos": "short"}
self.num_tasks_per_node = 40
# self.num_tasks_per_node = 40

@run_after("setup")
def setup_gpu_options(self):
"""sets up different resources for gpu systems"""
self.env_vars["PARAMS"] = (
f'"--exclusive --ntasks={self.num_tasks_per_node} --tasks-per-node={self.num_tasks_per_node}"'
)
self.env_vars["PARAMS"] = "--exclusive --ntasks=40 --tasks-per-node=40"
# Cirrus Slurm demands it be done this way.
# Trying to add $PARAMS directly to job.launcher.options fails.
if self.current_system.name in ["cirrus"]:
Expand Down
12 changes: 9 additions & 3 deletions tests/compile/hello/hello.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ def assert_finished(self):
class HelloTestCPU(HelloTestBase):
"""CPU systems test class"""

valid_systems = ["*"]
valid_prog_environs = ["-gpu"]
valid_systems = ["-gpu"]
valid_prog_environs = ["-gpu -default"]
extra_resources = {
"qos": {"qos": "standard"},
}
Expand All @@ -38,11 +38,17 @@ class HelloTestCPU(HelloTestBase):
class HelloTestGPU(HelloTestBase):
"""GPU systems test class"""

valid_systems = ["+gpu"]
valid_systems = ["-torch"]
valid_prog_environs = ["+gpu"]
extra_resources = {
"qos": {"qos": "gpu"},
"gpu": {"num_gpus_per_node": "1"},
}
num_tasks = None
num_cpus_per_task = None

@run_after("setup")
def setup_gpu_options(self):
"""Change qos for ARCHER2"""
if self.current_system.name in ["archer2"]:
self.extra_resources["qos"]["qos"] = "gpu-shd"
46 changes: 30 additions & 16 deletions tests/mlperf/cosmoflow/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,30 +12,20 @@ class CosmoFlowGPUBenchmark(CosmoFlowBaseCheck):
"""Cosmoflow GPU benchmark"""

valid_prog_environs = ["Default", "rocm-PrgEnv-gnu"]
valid_systems = ["cirrus:compute-gpu-default", "archer2:compute-gpu-torch"]
valid_systems = ["cirrus:compute-gpu", "archer2:compute-gpu-torch"]
descr = "CosmoFlow GPU Benchmark"

num_tasks = None
num_gpus = parameter([4]) # parameter(1 << pow for pow in range(7))
lbs = parameter([8])
# Due to memory, Cirrus is limited to an lbs of 2
lbs = parameter([2])

time_limit = "1h"
num_nodes = 1

@run_after("init")
def setup_systems(self):
"""Setup environment"""
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/archer2_config.yaml",
"--device",
"cuda",
"-lbs",
f"{self.lbs}",
# "--t_subset_size", "2048",
# "--v_subset_size", "512"
]
if self.current_system.name in ["archer2"]:
self.executable = ""
self.extra_resources = {
Expand All @@ -52,22 +42,46 @@ def setup_systems(self):
"LD_PRELOAD": "$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so:$LD_PRELOAD",
"HOME": "$PWD",
}
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/archer2_config.yaml",
"--device",
"cuda",
"-lbs",
"8",
# "--t_subset_size", "2048",
# "--v_subset_size", "512"
]

elif self.current_system.name in ["cirrus"]:
self.executable = "python"
self.extra_resources = {
"qos": {"qos": "gpu"},
}
self.modules = ["openmpi/4.1.5-cuda-11.6"]
self.modules = ["openmpi/4.1.6-cuda-11.6"]
self.prerun_cmds = [
'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"',
"conda activate mlperf_torch",
'eval "$(/work/z04/shared/ebroadwa/miniconda3/bin/conda shell.bash hook)"',
"conda activate torch_mlperf",
]
self.env_vars = {
"OMP_NUM_THREADS": "5",
"SRUN_CPUS_PER_TASK": "5",
"OMPI_MCA_mpi_warn_on_fork": "0",
}
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/cirrus_config.yaml",
"--device",
"cuda",
"--data-dir",
"/work/z04/shared/mlperf-hpc/cosmoflow/mini/cosmoUniverse_2019_05_4parE_tf_v2_mini",
"-lbs",
"2",
# "--t_subset_size", "2048",
# "--v_subset_size", "512"
]

@run_before("run")
def set_task_distribution(self):
Expand Down
46 changes: 28 additions & 18 deletions tests/mlperf/deepcam/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,30 +12,20 @@ class DeepCamGPUBenchmark(DeepCamBaseCheck):
"""Class for deepcam tests on gpus"""

valid_prog_environs = ["Default", "rocm-PrgEnv-gnu"]
valid_systems = ["cirrus:compute-gpu-default", "archer2:compute-gpu-torch"]
valid_systems = ["cirrus:compute-gpu", "archer2:compute-gpu-torch"]
descr = "Deepcam GPU Benchmark"

num_tasks = None
num_gpus = parameter([4]) # parameter(1 << pow for pow in range(7))
lbs = parameter([8])
# Due to memory, Cirrus is limited to a lbs of 2
# lbs = parameter([2])

time_limit = "1h"
num_nodes = 1

@run_after("init")
def setup_systems(self):
"""Setup environment"""
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/archer2benchmark_config.yaml",
"--device",
"cuda",
"-lbs",
f"{self.lbs}",
# "--t_subset_size", "1024",
# "--v_subset_size", "512"
]
if self.current_system.name in ["archer2"]:
self.executable = ""
self.extra_resources = {
Expand All @@ -52,22 +42,44 @@ def setup_systems(self):
"LD_PRELOAD": "$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so:$LD_PRELOAD",
"HOME": "$PWD",
}
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/archer2benchmark_config.yaml",
"--device",
"cuda",
"-lbs",
"8",
# "--t_subset_size", "1024",
# "--v_subset_size", "512"
]

elif self.current_system.name in ["cirrus"]:
self.executable = "python"
self.extra_resources = {
"qos": {"qos": "gpu"},
}
self.modules = ["openmpi/4.1.5-cuda-11.6"]
self.modules = ["openmpi/4.1.6-cuda-11.6"]
self.prerun_cmds = [
'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"',
"conda activate mlperf_torch",
'eval "$(/work/z04/shared/ebroadwa/miniconda3/bin/conda shell.bash hook)"',
"conda activate torch_mlperf",
]
self.env_vars = {
"OMP_NUM_THREADS": "5",
"SRUN_CPUS_PER_TASK": "5",
"OMPI_MCA_mpi_warn_on_fork": "0",
}
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/cirrusbenchmark_config.yaml",
"--device",
"cuda",
"-lbs",
"2",
# "--t_subset_size", "1024",
# "--v_subset_size", "512"
]

@run_before("run")
def set_task_distribution(self):
Expand Down Expand Up @@ -96,5 +108,3 @@ def setup_gpu_options(self):
self.job.launcher.options.append(
f"--ntasks={self.num_gpus} --tasks-per-node={self.num_gpus if self.num_gpus <= 4 else 4}"
)

# ----------------------------------------------------------------------------
Loading
Loading