From 14529dc9331af72108e3ec3b2bb30a1cb9d8aae0 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Wed, 4 Sep 2024 15:14:13 +0200 Subject: [PATCH 1/4] add hooks for measuring memory usage in a job --- eessi/testsuite/hooks.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index c19e71cf..39b76010 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -7,8 +7,11 @@ import reframe as rfm import reframe.core.logging as rflog +import reframe.utility.sanity as sn -from eessi.testsuite.constants import * +from eessi.testsuite.constants import (ALWAYS_REQUEST_GPUS, COMPUTE_UNIT, CPU, CPU_SOCKET, DEVICE_TYPES, FEATURES, GPU, + GPU_VENDOR, GPU_VENDORS, HWTHREAD, INVALID_SYSTEM, NODE, NUMA_NODE, NVIDIA, + SCALES) from eessi.testsuite.utils import (get_max_avail_gpus_per_node, is_cuda_required_module, log, check_proc_attribute_defined, check_extras_key_defined) @@ -695,3 +698,29 @@ def _check_always_request_gpus(test: rfm.RegressionTest): if FEATURES[ALWAYS_REQUEST_GPUS] in test.current_partition.features and not test.num_gpus_per_node: test.num_gpus_per_node = test.default_num_gpus_per_node log(f'num_gpus_per_node set to {test.num_gpus_per_node} for partition {test.current_partition.name}') + + +def write_memory_usage(test: rfm.RegressionTest): + """ + Write the memory usage into the job output file if we are in a Slurm job and if cgroups is enabled in Slurm + First try to obtain the memory with cgroups v2, if that fails try with cgroups v1 (v2 takes precedence) + Intended to be used in tandem with hook extract_memory() + """ + test.postrun_cmds = [ + 'path_v2=/sys/fs/cgroup/$(\S+)', test.stdout, 'memory', int) From b2c27332b5b37ca242ad4a44898ee207e7ca8dc4 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Wed, 4 Sep 2024 16:15:57 +0200 Subject: [PATCH 2/4] update comment and hook name --- eessi/testsuite/hooks.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 
deletions(-) diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index 39b76010..f1658088 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -718,9 +718,13 @@ def write_memory_usage(test: rfm.RegressionTest): ] -def extract_memory(test: rfm.RegressionTest): +def extract_memory_usage(test: rfm.RegressionTest): """ Extract the memory in MiB from the job output file as written by hook write_memory_usage() - Use in a test method with decorator @performance_function('MiB', perf_key='memory') + To Use this hook, add the following method to your test: + + @performance_function('MiB', perf_key='memory') + def memory_usage(self): + return hooks.extract_memory_usage(self) """ return sn.extractsingle(r'^MAX_MEM_IN_MIB=(?P<memory>\S+)', test.stdout, 'memory', int) From 317a2a8966e36c053613e04b628d616c8ec6b803 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Wed, 4 Sep 2024 16:42:52 +0200 Subject: [PATCH 3/4] add hooks usage; rename hooks --- eessi/testsuite/hooks.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index f1658088..f2eb0416 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -700,11 +700,17 @@ def _check_always_request_gpus(test: rfm.RegressionTest): log(f'num_gpus_per_node set to {test.num_gpus_per_node} for partition {test.current_partition.name}') -def write_memory_usage(test: rfm.RegressionTest): +def measure_memory_usage(test: rfm.RegressionTest): """ Write the memory usage into the job output file if we are in a Slurm job and if cgroups is enabled in Slurm First try to obtain the memory with cgroups v2, if that fails try with cgroups v1 (v2 takes precedence) - Intended to be used in tandem with hook extract_memory() + Intended to be used in tandem with hook extract_memory_usage() + To use this hook, add the following method to your test: + + @run_after('init') + def measure_memory_usage(self): + "Measure memory usage" + 
hooks.measure_memory_usage(self) """ test.postrun_cmds = [ 'path_v2=/sys/fs/cgroup/$(\S+)', test.stdout, 'memory', int) From ec13255cc0900aa73387d745494be4361cca00cd Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Wed, 4 Sep 2024 17:07:44 +0200 Subject: [PATCH 4/4] small comment update --- eessi/testsuite/hooks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index f2eb0416..3211f273 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -705,7 +705,7 @@ def measure_memory_usage(test: rfm.RegressionTest): Write the memory usage into the job output file if we are in a Slurm job and if cgroups is enabled in Slurm First try to obtain the memory with cgroups v2, if that fails try with cgroups v1 (v2 takes precedence) Intended to be used in tandem with hook extract_memory_usage() - To use this hook, add the following method to your test: + To use this hook, add the following method to your test class: @run_after('init') def measure_memory_usage(self): @@ -727,7 +727,7 @@ def measure_memory_usage(self): def extract_memory_usage(test: rfm.RegressionTest): """ Extract the memory in MiB from the job output file as written by hook measure_memory_usage() - To Use this hook, add the following method to your test: + To use this hook, add the following method to your test class: @performance_function('MiB', perf_key='memory') def extract_memory_usage(self):