diff --git a/apps/gromacs/reframe_gromacs.py b/apps/gromacs/reframe_gromacs.py index 350edfb..83ac79e 100644 --- a/apps/gromacs/reframe_gromacs.py +++ b/apps/gromacs/reframe_gromacs.py @@ -7,16 +7,16 @@ import reframe.utility.sanity as sn import os, sys, urllib, shutil, itertools sys.path.append('.') -from modules.reframe_extras import sequence, Scheduler_Info, CachedRunTest +from modules.reframe_extras import sequence, Scheduler_Info, ScalingTest from modules.utils import parse_time_cmd from reframe.core.logging import getlogger # parameterisation: -node_seq = sequence(1, Scheduler_Info().num_nodes + 1, 2) -benchmarks = ['1400k-atoms', '61k-atoms', '3000k-atoms'] +TEST_SIZES = [-1, -2, 0.25, 0.5, 1.0] # -ve numbers are absolute numbers of nodes, +ve are fraction of total partition nodes +CASES = ['1400k-atoms', '61k-atoms', '3000k-atoms'] -@rfm.parameterized_test(*list(itertools.product(benchmarks, node_seq))) -class Gromacs_HECBioSim(rfm.RunOnlyRegressionTest, CachedRunTest): +@rfm.parameterized_test(*list(itertools.product(CASES, TEST_SIZES))) +class Gromacs_HECBioSim(rfm.RunOnlyRegressionTest, ScalingTest): """ Run HECBioSim Gromacs benchmarks. Runs for environments named "gromacs" only. @@ -26,26 +26,19 @@ class Gromacs_HECBioSim(rfm.RunOnlyRegressionTest, CachedRunTest): Also handles download/unpacking benchmark files. - Set `self.use_cache=True` to re-run results processing without actually running - gromacs again, e.g. during debugging. See `reframe_extras.CachedRunTest` for details. - Args: casename: str, directory name from the HECBioSim benchmark download, one of: 1400k-atoms 20k-atoms 3000k-atoms 465k-atoms 61k-atoms - num_nodes: int, number of nodes to run on + test_size: Number of nodes to use, possibly as fraction of partition size. See `modules.reframe_extras.ScalingTest.partition_fraction`. 
""" - def __init__(self, casename, num_nodes): + def __init__(self, casename, test_size): - self.name = 'Gromacs_%s_%i' % (casename.split('-')[0], num_nodes) # e.g. Gromacs_1400k_2 - self.use_cache = False # set to True to debug outputs using cached results + self.name = 'Gromacs_%s_%s' % (casename.split('-')[0], test_size) # e.g. Gromacs_1400k_-2 self.casename = casename - self.num_nodes = num_nodes - - # these are the ones reframe uses: - self.num_tasks_per_node = Scheduler_Info().pcores_per_node - self.num_tasks = self.num_nodes * self.num_tasks_per_node - self.tags = {'num_procs=%i' % self.num_tasks, 'num_nodes=%i' % self.num_nodes, casename} + self.partition_fraction = test_size + self.node_fraction = 1.0 # use all physical cores + self.tags = {'test_size=%s' % test_size, casename} self.logfile = casename + '.log' @@ -55,7 +48,7 @@ def __init__(self, casename, num_nodes): # created by download_benchmarks(): self.sourcesdir = os.path.join(self.prefix, 'downloads', 'gromacs', self.casename) - self.pre_run = ['time \\'] + self.prerun_cmds = ['time \\'] self.executable = 'gmx_mpi' self.executable_opts = ['mdrun', '-s', 'benchmark.tpr', '-g', self.logfile, '-noconfout'] #, '-ntomp', '1'] add this to turn off threading self.exclusive_access = True @@ -91,7 +84,7 @@ def __init__(self, casename, num_nodes): # Finished mdrun on rank 0 Wed Jun 3 17:18:58 2020 # # - + @rfm.run_before('run') def download_benchmarks(self, ): """ Download & unpack HECBioSim Gromacs benchmarks diff --git a/modules/reframe_extras.py b/modules/reframe_extras.py index fc05f16..5911d9e 100644 --- a/modules/reframe_extras.py +++ b/modules/reframe_extras.py @@ -11,6 +11,60 @@ import os, shutil, subprocess, shlex, subprocess from pprint import pprint +class ScalingTest(rfm.RegressionTest): + """ Mixin to specify the number of nodes and processes-per-node to use relative to current partition resources. 
+ + Classes deriving from this must set the following to a number: + + - `self.partition_fraction` + - `self.node_fraction` + + If +ve, these give a factor which defines respectively: + - The number of nodes to use, as the number of nodes in the current scheduler partition * this factor + - The number of processes to use per node, as the number of physical cores * this factor + + If -ve, they must be an integer in which case they define the absolute number of nodes or processes to use respectively. + + Note that the current scheduler partition is affected by the (reframe) partition's `access` property. The following `sbatch` directives are + taken into account: + - `--partition`: use a non-default Slurm partition. + - `--exclude`: exclude specific nodes from consideration, i.e. a `partition_fraction` of 1.0 will not use these nodes. + + The following tags are set: + - `num_nodes`: the actual number of nodes used. + - `num_procs`: the actual number of MPI tasks used. + """ + + @rfm.run_before('run') + def set_nodes(self): + + scheduler_partition = Scheduler_Info(self.current_partition) + + # calculate number of nodes to use: + if not hasattr(self, 'partition_fraction'): + raise NameError('test classes derived from %r must define self.partition_fraction' % type(self)) + if self.partition_fraction < 0: + if not isinstance(self.partition_fraction, int): + raise TypeError('invalid self.partition_fraction of %r : -ve values should specify an integer number of nodes' % self.partition_fraction) + self.num_nodes = -1 * self.partition_fraction + else: + self.num_nodes = int(scheduler_partition.num_nodes * self.partition_fraction) + + # calculate number of tasks per node: + if not hasattr(self, 'node_fraction'): + raise NameError('test classes derived from %r must define self.node_fraction' % type(self)) + if self.node_fraction < 0: + if not isinstance(self.node_fraction, int): + raise TypeError('invalid self.node_fraction of %r : -ve values should specify an integer number of cores' 
% self.node_fraction) + self.num_tasks_per_node = -1 * self.node_fraction # reframe parameter + else: + self.num_tasks_per_node = int(scheduler_partition.pcores_per_node * self.node_fraction) # reframe parameter + + self.num_tasks = self.num_nodes * self.num_tasks_per_node # reframe parameter + + # set tags: + self.tags |= {'num_procs=%i' % self.num_tasks, 'num_nodes=%i' % self.num_nodes} + class CachedRunTest(rfm.RegressionTest): """ Mixin. TODO: document properly. @@ -162,6 +216,7 @@ def __init__(self, rfm_partition=None, exclude_states=None, only_states=None): - `--partition` - `--exclude` """ + # TODO: handle scheduler not being slurm! slurm_partition_name = None slurm_excluded_nodes = [] diff --git a/reframe_config.py b/reframe_config.py index c695af8..b1e53d4 100644 --- a/reframe_config.py +++ b/reframe_config.py @@ -5,7 +5,8 @@ 'access': [ '--partition=cclake', '--account=support-cpu', - '--exclude=cpu-p-[57-672]', # only use one rack's-worth of nodes at present + '--exclude=cpu-p-[1-56,113-672]', # only use one rack's-worth of nodes at present: use rack 2=57-112 as this is all ok + '--time=1:00:00', ], 'max_jobs': 20, } @@ -47,7 +48,7 @@ **{ 'name': 'cclake-ib-gcc9-ompi3-ucx', 'descr': '100Gb Infiniband using gcc 9.1.0 and openmpi 3.1.6 with UCX', - 'environs': ['sysinfo', 'imb'], + 'environs': ['sysinfo', 'imb', 'gromacs'], 'variables': [ ['SLURM_MPI_TYPE', 'pmix_v3'], # available for ompi3+ ['UCX_NET_DEVICES', 'mlx5_0:1'], # only use IB @@ -59,7 +60,7 @@ **{ 'name': 'cclake-roce-gcc9-ompi3-ucx', 'descr': '50Gb Infiniband using gcc 9.1.0 and openmpi 3.1.6 with UCX', - 'environs': ['sysinfo', 'imb'], + 'environs': ['sysinfo', 'imb', 'gromacs'], 'variables': [ ['SLURM_MPI_TYPE', 'pmix_v3'], # available for ompi3+ ['UCX_NET_DEVICES', 'mlx5_1:1'], # only use RoCE @@ -140,7 +141,7 @@ 'environments': [ { 'name': 'imb', # a non-targeted environment seems to be necessary for reframe to load the config - # will also work for csd3:cclake-ib-icc19-impi19-ucx as the 
partition module makes this available + # will also work for csd3:cclake-{ib,roce}-icc19-impi19-ucx as the partition's module makes this available }, { 'name': 'imb', @@ -169,7 +170,7 @@ }, { 'name': 'imb', - 'target_systems': ['csd3:cclake-ib-gcc9-ompi3-ucx'], + 'target_systems': ['csd3:cclake-ib-gcc9-ompi3-ucx', 'csd3:cclake-roce-gcc9-ompi3-ucx'], 'modules': ['intel-mpi-benchmarks-2019.6-gcc-9.1.0-5tbknir'] }, { @@ -182,8 +183,8 @@ }, { 'name': 'gromacs', - 'target_systems': ['arcus:ib-gcc9-openmpi4-ucx', 'arcus:roce-gcc9-openmpi4-ucx'], - 'modules': ['gromacs/2016.6-5ltvgvk'] + 'target_systems': ['csd3:cclake-ib-gcc9-ompi3-ucx', 'csd3:cclake-roce-gcc9-ompi3-ucx'], + 'modules': ['gromacs-2016.6-gcc-9.1.0-kgomb67'] }, { 'name': 'omb',