rejig gromacs + modules + config to scale by proportion of rack #2 nodes
sjpb committed Sep 23, 2020
1 parent 8cbeaf0 commit e21a609
Showing 3 changed files with 76 additions and 27 deletions.
33 changes: 13 additions & 20 deletions apps/gromacs/reframe_gromacs.py
@@ -7,16 +7,16 @@
 import reframe.utility.sanity as sn
 import os, sys, urllib, shutil, itertools
 sys.path.append('.')
-from modules.reframe_extras import sequence, Scheduler_Info, CachedRunTest
+from modules.reframe_extras import sequence, Scheduler_Info, ScalingTest
 from modules.utils import parse_time_cmd
 from reframe.core.logging import getlogger

 # parameterisation:
-node_seq = sequence(1, Scheduler_Info().num_nodes + 1, 2)
-benchmarks = ['1400k-atoms', '61k-atoms', '3000k-atoms']
+TEST_SIZES = [-1, -2, 0.25, 0.5, 1.0] # -ve numbers are absolute numbers of nodes, +ve are fractions of total partition nodes
+CASES = ['1400k-atoms', '61k-atoms', '3000k-atoms']

-@rfm.parameterized_test(*list(itertools.product(benchmarks, node_seq)))
-class Gromacs_HECBioSim(rfm.RunOnlyRegressionTest, CachedRunTest):
+@rfm.parameterized_test(*list(itertools.product(CASES, TEST_SIZES)))
+class Gromacs_HECBioSim(rfm.RunOnlyRegressionTest, ScalingTest):
     """ Run HECBioSim Gromacs benchmarks.
     Runs for environments named "gromacs" only.
@@ -26,26 +26,19 @@ class Gromacs_HECBioSim(rfm.RunOnlyRegressionTest, CachedRunTest):
     Also handles download/unpacking benchmark files.
-    Set `self.use_cache=True` to re-run results processing without actually running
-    gromacs again, e.g. during debugging. See `reframe_extras.CachedRunTest` for details.
     Args:
         casename: str, directory name from the HECBioSim benchmark download, one of:
            1400k-atoms 20k-atoms 3000k-atoms 465k-atoms 61k-atoms
-        num_nodes: int, number of nodes to run on
+        test_size: number of nodes to use, possibly as a fraction of the partition size. See `modules.reframe_extras.ScalingTest.partition_fraction`.
     """
-    def __init__(self, casename, num_nodes):
+    def __init__(self, casename, test_size):

-        self.name = 'Gromacs_%s_%i' % (casename.split('-')[0], num_nodes) # e.g. Gromacs_1400k_2
-        self.use_cache = False # set to True to debug outputs using cached results
+        self.name = 'Gromacs_%s_%s' % (casename.split('-')[0], test_size) # e.g. Gromacs_1400k_-2

         self.casename = casename
-        self.num_nodes = num_nodes

-        # these are the ones reframe uses:
-        self.num_tasks_per_node = Scheduler_Info().pcores_per_node
-        self.num_tasks = self.num_nodes * self.num_tasks_per_node
-        self.tags = {'num_procs=%i' % self.num_tasks, 'num_nodes=%i' % self.num_nodes, casename}
+        self.partition_fraction = test_size
+        self.node_fraction = 1.0 # use all physical cores
+        self.tags = {'test_size=%s' % test_size, casename}

         self.logfile = casename + '.log'
@@ -55,7 +48,7 @@ def __init__(self, casename, num_nodes):
         # created by download_benchmarks():
         self.sourcesdir = os.path.join(self.prefix, 'downloads', 'gromacs', self.casename)

-        self.pre_run = ['time \\']
+        self.prerun_cmds = ['time \\']
         self.executable = 'gmx_mpi'
         self.executable_opts = ['mdrun', '-s', 'benchmark.tpr', '-g', self.logfile, '-noconfout'] #, '-ntomp', '1'] add this to turn off threading
         self.exclusive_access = True
@@ -91,7 +84,7 @@ def __init__(self, casename, num_nodes):
     # Finished mdrun on rank 0 Wed Jun 3 17:18:58 2020
     #
     # <EOF>

     @rfm.run_before('run')
     def download_benchmarks(self):
         """ Download & unpack HECBioSim Gromacs benchmarks
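
For orientation, a sketch (not part of the commit) of how the new parameterisation expands into tests; the 56-node partition size is an assumption matching the cpu-p-[57-112] rack used in the config change below:

import itertools

TEST_SIZES = [-1, -2, 0.25, 0.5, 1.0]   # -ve: absolute node counts; +ve: fractions of partition nodes
CASES = ['1400k-atoms', '61k-atoms', '3000k-atoms']
PARTITION_NODES = 56                    # hypothetical: rack 2 = cpu-p-[57-112]

for casename, test_size in itertools.product(CASES, TEST_SIZES):
    # mirrors ScalingTest.set_nodes(): -ve sizes are absolute, +ve scale the partition size
    num_nodes = -test_size if test_size < 0 else int(PARTITION_NODES * test_size)
    print('Gromacs_%s_%s -> %i nodes' % (casename.split('-')[0], test_size, num_nodes))

# e.g. Gromacs_1400k_-2 -> 2 nodes, Gromacs_61k_0.5 -> 28 nodes, Gromacs_3000k_1.0 -> 56 nodes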
55 changes: 55 additions & 0 deletions modules/reframe_extras.py
@@ -11,6 +11,60 @@
 import os, shutil, subprocess, shlex, subprocess
 from pprint import pprint

+class ScalingTest(rfm.RegressionTest):
+    """ Mixin to specify the number of nodes and processes-per-node to use, relative to the current partition's resources.
+
+        Classes deriving from this must set both of the following to a number:
+        - `self.partition_fraction`
+        - `self.node_fraction`
+        If +ve, these give factors defining, respectively:
+        - the number of nodes to use, as the number of nodes in the current scheduler partition * this factor
+        - the number of processes per node, as the number of physical cores per node * this factor
+        If -ve, they must be integers and give, respectively, the absolute number of nodes or processes-per-node to use.
+
+        Note that the current scheduler partition is affected by the (reframe) partition's `access` property. The following
+        `sbatch` directives are taken into account:
+        - `--partition`: use a non-default Slurm partition.
+        - `--exclude`: exclude specific nodes from consideration, i.e. a `partition_fraction` of 1.0 will not use these nodes.
+
+        The following tags are set:
+        - `num_nodes`: the actual number of nodes used.
+        - `num_procs`: the actual number of MPI tasks used.
+    """
+
+    @rfm.run_before('run')
+    def set_nodes(self):
+
+        scheduler_partition = Scheduler_Info(self.current_partition)
+
+        # calculate number of nodes to use:
+        if not hasattr(self, 'partition_fraction'):
+            raise NameError('test classes derived from %r must define self.partition_fraction' % type(self))
+        if self.partition_fraction < 0:
+            if not isinstance(self.partition_fraction, int):
+                raise TypeError('invalid self.partition_fraction of %r: -ve values must be an integer number of nodes' % self.partition_fraction)
+            self.num_nodes = -1 * self.partition_fraction
+        else:
+            self.num_nodes = int(scheduler_partition.num_nodes * self.partition_fraction)
+
+        # calculate number of tasks per node:
+        if not hasattr(self, 'node_fraction'):
+            raise NameError('test classes derived from %r must define self.node_fraction' % type(self))
+        if self.node_fraction < 0:
+            if not isinstance(self.node_fraction, int):
+                raise TypeError('invalid self.node_fraction of %r: -ve values must be an integer number of cores' % self.node_fraction)
+            self.num_tasks_per_node = -1 * self.node_fraction # reframe parameter
+        else:
+            self.num_tasks_per_node = int(scheduler_partition.pcores_per_node * self.node_fraction) # reframe parameter
+
+        self.num_tasks = self.num_nodes * self.num_tasks_per_node # reframe parameter
+
+        # set tags:
+        self.tags |= {'num_procs=%i' % self.num_tasks, 'num_nodes=%i' % self.num_nodes}
+
 class CachedRunTest(rfm.RegressionTest):
     """ Mixin. TODO: document properly.
@@ -162,6 +216,7 @@ def __init__(self, rfm_partition=None, exclude_states=None, only_states=None):
             - `--partition`
             - `--exclude`
         """
+
         # TODO: handle scheduler not being slurm!
         slurm_partition_name = None
         slurm_excluded_nodes = []
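
A minimal usage sketch for the new mixin (illustrative only: the class name, executable, sanity check and fraction values are hypothetical, not from this commit):

import reframe as rfm
import reframe.utility.sanity as sn
from modules.reframe_extras import ScalingTest

@rfm.simple_test
class ExampleScaling(rfm.RunOnlyRegressionTest, ScalingTest):
    def __init__(self):
        self.valid_systems = ['*']
        self.valid_prog_environs = ['*']
        self.partition_fraction = 0.5   # +ve: half the nodes in the current scheduler partition
        self.node_fraction = -2         # -ve integer: exactly 2 MPI tasks per node
        self.executable = 'hostname'
        self.sanity_patterns = sn.assert_found(r'.', self.stdout)

On a 56-node partition this resolves to num_nodes=28, num_tasks_per_node=2 and num_tasks=56; `set_nodes()` fills these in just before the run and tags the test with `num_nodes=28` and `num_procs=56`.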
15 changes: 8 additions & 7 deletions reframe_config.py
@@ -5,7 +5,8 @@
         'access': [
             '--partition=cclake',
             '--account=support-cpu',
-            '--exclude=cpu-p-[57-672]', # only use one rack's-worth of nodes at present
+            '--exclude=cpu-p-[1-56,113-672]', # only use one rack's-worth of nodes at present: rack 2 (cpu-p-[57-112]) as these are all ok
+            '--time=1:00:00',
         ],
         'max_jobs': 20,
     }
@@ -47,7 +48,7 @@
     **{
         'name': 'cclake-ib-gcc9-ompi3-ucx',
         'descr': '100Gb Infiniband using gcc 9.1.0 and openmpi 3.1.6 with UCX',
-        'environs': ['sysinfo', 'imb'],
+        'environs': ['sysinfo', 'imb', 'gromacs'],
         'variables': [
             ['SLURM_MPI_TYPE', 'pmix_v3'], # available for ompi3+
             ['UCX_NET_DEVICES', 'mlx5_0:1'], # only use IB
@@ -59,7 +60,7 @@
     **{
         'name': 'cclake-roce-gcc9-ompi3-ucx',
         'descr': '50Gb RoCE using gcc 9.1.0 and openmpi 3.1.6 with UCX',
-        'environs': ['sysinfo', 'imb'],
+        'environs': ['sysinfo', 'imb', 'gromacs'],
         'variables': [
             ['SLURM_MPI_TYPE', 'pmix_v3'], # available for ompi3+
             ['UCX_NET_DEVICES', 'mlx5_1:1'], # only use RoCE
@@ -140,7 +141,7 @@
     'environments': [
         {
            'name': 'imb', # a non-targeted environment seems to be necessary for reframe to load the config
-           # will also work for csd3:cclake-ib-icc19-impi19-ucx as the partition module makes this available
+           # will also work for csd3:cclake-{ib,roce}-icc19-impi19-ucx as the partition's module makes this available
        },
        {
            'name': 'imb',
@@ -169,7 +170,7 @@
        },
        {
            'name': 'imb',
-           'target_systems': ['csd3:cclake-ib-gcc9-ompi3-ucx'],
+           'target_systems': ['csd3:cclake-ib-gcc9-ompi3-ucx', 'csd3:cclake-roce-gcc9-ompi3-ucx'],
            'modules': ['intel-mpi-benchmarks-2019.6-gcc-9.1.0-5tbknir']
        },
        {
@@ -182,8 +183,8 @@
        },
        {
            'name': 'gromacs',
-           'target_systems': ['arcus:ib-gcc9-openmpi4-ucx', 'arcus:roce-gcc9-openmpi4-ucx'],
-           'modules': ['gromacs/2016.6-5ltvgvk']
+           'target_systems': ['csd3:cclake-ib-gcc9-ompi3-ucx', 'csd3:cclake-roce-gcc9-ompi3-ucx'],
+           'modules': ['gromacs-2016.6-gcc-9.1.0-kgomb67']
        },
        {
            'name': 'omb',
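
As a quick sanity check on the new `--exclude` list, a throwaway sketch (assuming the nodes are numbered cpu-p-[1-672], as the exclude ranges suggest):

# excluding cpu-p-[1-56,113-672] from cpu-p-[1-672] should leave exactly rack 2
excluded = set(range(1, 57)) | set(range(113, 673))
available = [n for n in range(1, 673) if n not in excluded]
assert available == list(range(57, 113))   # cpu-p-[57-112]: one rack's-worth, 56 nodes
print('%i nodes available: cpu-p-[%i-%i]' % (len(available), available[0], available[-1]))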
