From 2601af9362f506066734e39885c1b3cd5ce8eb86 Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Tue, 24 Sep 2024 15:56:30 +0200 Subject: [PATCH 1/7] First attempt to test squeue scheduler Signed-off-by: Theofilos Manitaras --- .github/slurm-cluster/docker-compose.yml | 113 +++++++++++++ .github/slurm-cluster/reframe/Dockerfile | 42 +++++ .github/slurm-cluster/reframe/cgroup.conf | 6 + .../reframe/docker-entrypoint.sh | 20 +++ .github/slurm-cluster/reframe/slurm.conf | 153 ++++++++++++++++++ .github/workflows/test-schedulers.yaml | 21 +++ 6 files changed, 355 insertions(+) create mode 100644 .github/slurm-cluster/docker-compose.yml create mode 100644 .github/slurm-cluster/reframe/Dockerfile create mode 100644 .github/slurm-cluster/reframe/cgroup.conf create mode 100755 .github/slurm-cluster/reframe/docker-entrypoint.sh create mode 100644 .github/slurm-cluster/reframe/slurm.conf create mode 100644 .github/workflows/test-schedulers.yaml diff --git a/.github/slurm-cluster/docker-compose.yml b/.github/slurm-cluster/docker-compose.yml new file mode 100644 index 0000000000..8290a74ec9 --- /dev/null +++ b/.github/slurm-cluster/docker-compose.yml @@ -0,0 +1,113 @@ +services: + munge-key-generator: + image: ghcr.io/reframe-hpc/munge-ubuntu:20.04 + hostname: munge-host + healthcheck: + test: ["CMD-SHELL", "test -f /scratch/munge.key"] + interval: 10s + timeout: 10s + retries: 5 + volumes: + - shared-scratch:/scratch + + frontend: + image: slurm-reframe + container_name: frontend + build: + dockerfile: .github/slurm-cluster/reframe/Dockerfile + context: ../../ + hostname: login + user: admin + init: True + volumes: + - shared-home:/home/admin:rw + - shared-scratch:/scratch:rw + links: + - slurm-master + depends_on: + munge-key-generator: + condition: service_healthy + slurm-master: + condition: service_started + node0: + condition: service_started + node1: + condition: service_started + node2: + condition: service_started + environment: + - SLURM_CPUS_ON_NODE=1 + + slurm-master: + image: ghcr.io/reframe-hpc/slurm-master-ubuntu:20.04 + hostname: slurm-master + user: admin + volumes: + - shared-home:/home/admin + - shared-scratch:/scratch:rw + depends_on: + munge-key-generator: + condition: service_healthy + environment: + - SLURM_CPUS_ON_NODE=1 + + node0: + image: ghcr.io/reframe-hpc/slurm-node-ubuntu:20.04 + hostname: nid00 + container_name: slurm-node0 + user: admin + volumes: + - shared-home:/home/admin + - shared-scratch:/scratch:rw + environment: + - SLURM_NODENAME=nid00 + - SLURM_CPUS_ON_NODE=1 + depends_on: + munge-key-generator: + condition: service_healthy + slurm-master: + condition: service_started + links: + - slurm-master + + node1: + image: ghcr.io/reframe-hpc/slurm-node-ubuntu:20.04 + hostname: nid01 + container_name: slurm-node1 + user: admin + volumes: + - shared-home:/home/admin + - shared-scratch:/scratch:rw + environment: + - SLURM_NODENAME=nid01 + - SLURM_CPUS_ON_NODE=1 + depends_on: + munge-key-generator: + condition: service_healthy + slurm-master: + condition: service_started + links: + - slurm-master + + node2: + image: ghcr.io/reframe-hpc/slurm-node-ubuntu:20.04 + hostname: nid02 + container_name: slurm-node2 + user: admin + volumes: + - shared-home:/home/admin + - shared-scratch:/scratch:rw + environment: + - SLURM_NODENAME=nid02 + - SLURM_CPUS_ON_NODE=1 + depends_on: + munge-key-generator: + condition: service_healthy + slurm-master: + condition: service_started + links: + - slurm-master + +volumes: + shared-home: + shared-scratch: diff --git a/.github/slurm-cluster/reframe/Dockerfile b/.github/slurm-cluster/reframe/Dockerfile new file mode 100644 index 0000000000..53fc075d44 --- /dev/null +++ b/.github/slurm-cluster/reframe/Dockerfile @@ -0,0 +1,42 @@ +FROM ubuntu:20.04 + +ARG DEBIAN_FRONTEND=noninteractive + +RUN apt update -y && \ + apt install -y \ + build-essential \ + clang jq libomp-dev tree vim \ + git \ + mariadb-client \ + munge \ + slurm-client \ + slurm-wlm-torque \ + sudo \ + python3 \ + python3-pip \ + wget \ + curl \ + mpich \ + libmpich-dev && \ + rm -rf /var/lib/apt/lists/* + +RUN useradd -m admin -s /usr/bin/bash -d /home/admin && \ + echo "admin:admin" | chpasswd && adduser admin sudo && \ + echo "admin ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers + +COPY .github/slurm-cluster/reframe/slurm.conf /etc/slurm-llnl/ +COPY .github/slurm-cluster/reframe/cgroup.conf /etc/slurm-llnl/ +COPY .github/slurm-cluster/reframe/docker-entrypoint.sh /etc/slurm-llnl/ +COPY . /usr/local/share/reframe + +RUN mkdir /scratch && \ + chown -R admin:admin /scratch + +RUN chmod +rx /etc/slurm-llnl/docker-entrypoint.sh + +WORKDIR /home/admin + +ENV USER admin +ENV SHELL bash + +ENTRYPOINT ["/etc/slurm-llnl/docker-entrypoint.sh"] diff --git a/.github/slurm-cluster/reframe/cgroup.conf b/.github/slurm-cluster/reframe/cgroup.conf new file mode 100644 index 0000000000..0b634635a4 --- /dev/null +++ b/.github/slurm-cluster/reframe/cgroup.conf @@ -0,0 +1,6 @@ +CgroupAutomount=yes +CgroupReleaseAgentDir="/etc/slurm/cgroup" +ConstrainCores=yes +ConstrainDevices=yes +ConstrainRAMSpace=yes + diff --git a/.github/slurm-cluster/reframe/docker-entrypoint.sh b/.github/slurm-cluster/reframe/docker-entrypoint.sh new file mode 100755 index 0000000000..48f9a9452c --- /dev/null +++ b/.github/slurm-cluster/reframe/docker-entrypoint.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +trap exit 0 INT + +while [ ! -f /scratch/munge.key ] +do + sleep 1 +done + +sudo cp /scratch/munge.key /etc/munge/munge.key +sudo service munge start +sudo sed -i "s/REPLACE_IT/CPUs=${SLURM_CPUS_ON_NODE}/g" /etc/slurm-llnl/slurm.conf + +# Needs to be copied in the shared home directory +cp -r /usr/local/share/reframe . +cd reframe +./bootstrap.sh + +tempdir=$(mktemp -d -p /scratch) +TMPDIR=$tempdir ./test_reframe.py -v --rfm-user-config=examples/tutorial/config/cluster.py diff --git a/.github/slurm-cluster/reframe/slurm.conf b/.github/slurm-cluster/reframe/slurm.conf new file mode 100644 index 0000000000..9e2c782156 --- /dev/null +++ b/.github/slurm-cluster/reframe/slurm.conf @@ -0,0 +1,153 @@ +# slurm.conf file generated by configurator.html. +# Put this file on all nodes of your cluster. +# See the slurm.conf man page for more information. +# +SlurmctldHost=slurm-master +# +#DisableRootJobs=NO +#EnforcePartLimits=NO +#Epilog= +#EpilogSlurmctld= +#FirstJobId=1 +#MaxJobId=999999 +#GresTypes= +#GroupUpdateForce=0 +#GroupUpdateTime=600 +#JobFileAppend=0 +#JobRequeue=1 +#JobSubmitPlugins=1 +#KillOnBadExit=0 +#LaunchType=launch/slurm +#Licenses=foo*4,bar +#MailProg=/bin/mail +#MaxJobCount=5000 +#MaxStepCount=40000 +#MaxTasksPerNode=128 +MpiDefault=pmi2 +#MpiParams=ports=#-# +#PluginDir= +#PlugStackConfig= +#PrivateData=jobs +#ProctrackType=proctrack/cgroup +ProctrackType=proctrack/linuxproc +#Prolog= +#PrologFlags= +#PrologSlurmctld= +#PropagatePrioProcess=0 +#PropagateResourceLimits= +#PropagateResourceLimitsExcept= +#RebootProgram= +ReturnToService=1 +#SallocDefaultCommand= +SlurmctldPidFile=/var/run/slurmctld.pid +SlurmctldPort=6817 +SlurmdPidFile=/var/run/slurmd.pid +SlurmdPort=6818 +SlurmdSpoolDir=/var/spool/slurmd +SlurmUser=root +#SlurmdUser=root +#SrunEpilog= +#SrunProlog= +StateSaveLocation=/var/spool +SwitchType=switch/none +#TaskEpilog= +TaskPlugin=task/affinity +TaskPluginParam=Sched +#TaskProlog= +#TopologyPlugin=topology/tree +#TmpFS=/tmp +#TrackWCKey=no +#TreeWidth= +#UnkillableStepProgram= +#UsePAM=0 +# +# +# TIMERS +#BatchStartTimeout=10 +#CompleteWait=0 +#EpilogMsgTime=2000 +#GetEnvTimeout=2 +#HealthCheckInterval=0 +#HealthCheckProgram= +InactiveLimit=0 +KillWait=30 +#MessageTimeout=10 +#ResvOverRun=0 +MinJobAge=300 +#OverTimeLimit=0 +SlurmctldTimeout=120 +SlurmdTimeout=300 +#UnkillableStepTimeout=60 +#VSizeFactor=0 +Waittime=0 +# +# +# SCHEDULING +#DefMemPerCPU=0 +#MaxMemPerCPU=0 +#SchedulerTimeSlice=30 +SchedulerType=sched/backfill +SelectType=select/cons_res +SelectTypeParameters=CR_Core +# +# +# JOB PRIORITY +#PriorityFlags= +#PriorityType=priority/basic +#PriorityDecayHalfLife= +#PriorityCalcPeriod= +#PriorityFavorSmall= +#PriorityMaxAge= +#PriorityUsageResetPeriod= +#PriorityWeightAge= +#PriorityWeightFairshare= +#PriorityWeightJobSize= +#PriorityWeightPartition= +#PriorityWeightQOS= +# +# +# LOGGING AND ACCOUNTING +#AccountingStorageEnforce=0 +#AccountingStorageHost= +#AccountingStorageLoc= +#AccountingStoragePass= +#AccountingStoragePort= +AccountingStorageType=accounting_storage/none +#AccountingStorageUser= +AccountingStoreJobComment=YES +ClusterName=cluster +#DebugFlags= +#JobCompHost= +#JobCompLoc= +#JobCompPass= +#JobCompPort= +JobCompType=jobcomp/none +#JobCompUser= +#JobContainerType=job_container/none +JobAcctGatherFrequency=30 +JobAcctGatherType=jobacct_gather/none +SlurmctldDebug=error +SlurmctldLogFile=/var/log/slurm-llnl/slurmctld.log +SlurmdDebug=error +SlurmdLogFile=/var/log/slurm-llnl/slurmd.log +#SlurmSchedLogFile= +#SlurmSchedLogLevel= +# +# +# POWER SAVE SUPPORT FOR IDLE NODES (optional) +#SuspendProgram= +#ResumeProgram= +#SuspendTimeout= +#ResumeTimeout= +#ResumeRate= +#SuspendExcNodes= +#SuspendExcParts= +#SuspendRate= +#SuspendTime= +# +# +# COMPUTE NODES +# +NodeName=nid0[0-2] REPLACE_IT State=UNKNOWN +PartitionName=all Nodes=ALL Default=YES MaxTime=INFINITE State=UP + diff --git a/.github/workflows/test-schedulers.yaml b/.github/workflows/test-schedulers.yaml new file mode 100644 index 0000000000..585e8f2be9 --- /dev/null +++ b/.github/workflows/test-schedulers.yaml @@ -0,0 +1,21 @@ +name: Test Schedulers +on: + pull_request: [] + +jobs: + squeue-test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Build Images + run: | + docker compose -f ./github/slurm-cluster/docker-compose.yaml build + - name: Run Unittests + run: | + docker compose -f ./github/slurm-cluster/docker-compose.yml up --abort-on-container-exit --exit-code-from frontend From 0007b2dcd2b5a1e9fa249851df03be0c4736251d Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Tue, 24 Sep 2024 15:58:58 +0200 Subject: [PATCH 2/7] Fix path to yaml file Signed-off-by: Theofilos Manitaras --- .github/workflows/test-schedulers.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-schedulers.yaml b/.github/workflows/test-schedulers.yaml index 585e8f2be9..5038d48582 100644 --- a/.github/workflows/test-schedulers.yaml +++ b/.github/workflows/test-schedulers.yaml @@ -15,7 +15,7 @@ jobs: password: ${{ secrets.GITHUB_TOKEN }} - name: Build Images run: | - docker compose -f ./github/slurm-cluster/docker-compose.yaml build + docker compose -f .github/slurm-cluster/docker-compose.yaml build - name: Run Unittests run: | - docker compose -f ./github/slurm-cluster/docker-compose.yml up --abort-on-container-exit --exit-code-from frontend + docker compose -f .github/slurm-cluster/docker-compose.yml up --abort-on-container-exit --exit-code-from frontend From ff2eea3fb35154fe71126da2bb7dd00065a7392b Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Tue, 24 Sep 2024 16:03:47 +0200 Subject: [PATCH 3/7] Fix .yaml file suffix --- .github/workflows/test-schedulers.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-schedulers.yaml b/.github/workflows/test-schedulers.yaml index 5038d48582..4884073a17 100644 --- a/.github/workflows/test-schedulers.yaml +++ b/.github/workflows/test-schedulers.yaml @@ -15,7 +15,7 @@ jobs: password: ${{ secrets.GITHUB_TOKEN }} - name: Build Images run: | - docker compose -f .github/slurm-cluster/docker-compose.yaml build + docker compose -f .github/slurm-cluster/docker-compose.yml build - name: Run Unittests run: | docker compose -f .github/slurm-cluster/docker-compose.yml up --abort-on-container-exit --exit-code-from frontend From d9fd79b5d077dead08aea8c9bcb86c185d2af81c Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Wed, 25 Sep 2024 11:10:16 +0200 Subject: [PATCH 4/7] Run unittests with multiple schedulers Signed-off-by: Theofilos Manitaras --- .../docker-compose.yml | 3 +- .../reframe/Dockerfile | 6 +- .../reframe/cgroup.conf | 0 .../reframe/docker-entrypoint.sh | 4 +- .../reframe/slurm.conf | 0 .github/workflows/test-schedulers.yaml | 11 ++-- config/ci-cluster.py | 66 +++++++++++++++++++ 7 files changed, 81 insertions(+), 9 deletions(-) rename .github/{slurm-cluster => pseudo-cluster}/docker-compose.yml (96%) rename .github/{slurm-cluster => pseudo-cluster}/reframe/Dockerfile (79%) rename .github/{slurm-cluster => pseudo-cluster}/reframe/cgroup.conf (100%) rename .github/{slurm-cluster => pseudo-cluster}/reframe/docker-entrypoint.sh (72%) rename .github/{slurm-cluster => pseudo-cluster}/reframe/slurm.conf (100%) create mode 100644 config/ci-cluster.py diff --git a/.github/slurm-cluster/docker-compose.yml b/.github/pseudo-cluster/docker-compose.yml similarity index 96% rename from .github/slurm-cluster/docker-compose.yml rename to .github/pseudo-cluster/docker-compose.yml index 8290a74ec9..a6adb767f4 100644 --- a/.github/slurm-cluster/docker-compose.yml +++ b/.github/pseudo-cluster/docker-compose.yml @@ -14,7 +14,7 @@ services: image: slurm-reframe container_name: frontend build: - dockerfile: .github/slurm-cluster/reframe/Dockerfile + dockerfile: .github/pseudo-cluster/reframe/Dockerfile context: ../../ hostname: login user: admin @@ -37,6 +37,7 @@ services: condition: service_started environment: - SLURM_CPUS_ON_NODE=1 + - BACKEND=${BACKEND:-squeue} slurm-master: image: ghcr.io/reframe-hpc/slurm-master-ubuntu:20.04 diff --git a/.github/slurm-cluster/reframe/Dockerfile b/.github/pseudo-cluster/reframe/Dockerfile similarity index 79% rename from .github/slurm-cluster/reframe/Dockerfile rename to .github/pseudo-cluster/reframe/Dockerfile index 53fc075d44..afeac49f98 100644 --- a/.github/slurm-cluster/reframe/Dockerfile +++ b/.github/pseudo-cluster/reframe/Dockerfile @@ -24,9 +24,9 @@ RUN useradd -m admin -s /usr/bin/bash -d /home/admin && \ echo "admin:admin" | chpasswd && adduser admin sudo && \ echo "admin ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers -COPY .github/slurm-cluster/reframe/slurm.conf /etc/slurm-llnl/ -COPY .github/slurm-cluster/reframe/cgroup.conf /etc/slurm-llnl/ -COPY .github/slurm-cluster/reframe/docker-entrypoint.sh /etc/slurm-llnl/ +COPY .github/pseudo-cluster/reframe/slurm.conf /etc/slurm-llnl/ +COPY .github/pseudo-cluster/reframe/cgroup.conf /etc/slurm-llnl/ +COPY .github/pseudo-cluster/reframe/docker-entrypoint.sh /etc/slurm-llnl/ COPY . /usr/local/share/reframe RUN mkdir /scratch && \ diff --git a/.github/slurm-cluster/reframe/cgroup.conf b/.github/pseudo-cluster/reframe/cgroup.conf similarity index 100% rename from .github/slurm-cluster/reframe/cgroup.conf rename to .github/pseudo-cluster/reframe/cgroup.conf diff --git a/.github/slurm-cluster/reframe/docker-entrypoint.sh b/.github/pseudo-cluster/reframe/docker-entrypoint.sh similarity index 72% rename from .github/slurm-cluster/reframe/docker-entrypoint.sh rename to .github/pseudo-cluster/reframe/docker-entrypoint.sh index 48f9a9452c..9509ccb508 100755 --- a/.github/slurm-cluster/reframe/docker-entrypoint.sh +++ b/.github/pseudo-cluster/reframe/docker-entrypoint.sh @@ -17,4 +17,6 @@ cd reframe ./bootstrap.sh tempdir=$(mktemp -d -p /scratch) -TMPDIR=$tempdir ./test_reframe.py -v --rfm-user-config=examples/tutorial/config/cluster.py +TMPDIR=$tempdir ./test_reframe.py -v \ + --rfm-user-config=config/ci-cluster.py \ + --rfm-user-system=pseudo-cluster:compute-${BACKEND:-squeue} diff --git a/.github/slurm-cluster/reframe/slurm.conf b/.github/pseudo-cluster/reframe/slurm.conf similarity index 100% rename from .github/slurm-cluster/reframe/slurm.conf rename to .github/pseudo-cluster/reframe/slurm.conf diff --git a/.github/workflows/test-schedulers.yaml b/.github/workflows/test-schedulers.yaml index 4884073a17..5b35709f97 100644 --- a/.github/workflows/test-schedulers.yaml +++ b/.github/workflows/test-schedulers.yaml @@ -3,8 +3,11 @@ on: pull_request: [] jobs: - squeue-test: + scheduler-test: runs-on: ubuntu-latest + strategy: + matrix: + scheduler: ['squeue', 'torque'] steps: - uses: actions/checkout@v4 - name: Login to GitHub Container Registry @@ -15,7 +18,7 @@ jobs: password: ${{ secrets.GITHUB_TOKEN }} - name: Build Images run: | - docker compose -f .github/slurm-cluster/docker-compose.yml build - - name: Run Unittests + docker compose -f .github/pseudo-cluster/docker-compose.yml build + - name: Run Unittests with ${{ matrix.scheduler }} sceduler run: | - docker compose -f .github/slurm-cluster/docker-compose.yml up --abort-on-container-exit --exit-code-from frontend + BACKEND=${{ matrix.scheduler }}docker compose -f .github/pseudo-cluster/docker-compose.yml up --abort-on-container-exit --exit-code-from frontend diff --git a/config/ci-cluster.py b/config/ci-cluster.py new file mode 100644 index 0000000000..32487114ad --- /dev/null +++ b/config/ci-cluster.py @@ -0,0 +1,66 @@ +# Copyright 2016-2024 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: BSD-3-Clause + +site_configuration = { + 'systems': [ + { + 'name': 'pseudo-cluster', + 'descr': 'CI Slurm-based pseudo cluster', + 'hostnames': ['login'], + 'partitions': [ + { + 'name': 'login', + 'descr': 'Login nodes', + 'scheduler': 'local', + 'launcher': 'local', + 'environs': ['gnu', 'clang'] + }, + { + 'name': 'compute-squeue', + 'descr': 'Squeue compute nodes', + 'scheduler': 'squeue', + 'launcher': 'srun', + 'access': ['-p all'], + 'environs': ['gnu', 'clang'] + }, + { + 'name': 'compute-torque', + 'descr': 'Torque compute nodes', + 'scheduler': 'squeue', + 'launcher': 'mpiexec', + 'access': ['-p all'], + 'environs': ['gnu', 'clang'] + } + + ] + }, + ], + 'environments': [ + { + 'name': 'baseline', + 'features': ['stream'] + }, + { + 'name': 'gnu', + 'cc': 'gcc', + 'cxx': 'g++', + 'features': ['openmp'], + 'extras': {'omp_flag': '-fopenmp'} + }, + { + 'name': 'clang', + 'cc': 'clang', + 'cxx': 'clang++', + 'features': ['openmp'], + 'extras': {'omp_flag': '-fopenmp'} + } + ], + 'modes': [ + { + 'name': 'singlethread', + 'options': ['-E num_threads==1'] + } + ] +} From 297d2e679622d30fd2cc57e4b1b72292dba0aec0 Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Wed, 25 Sep 2024 11:13:50 +0200 Subject: [PATCH 5/7] Add missing space Signed-off-by: Theofilos Manitaras --- .github/workflows/test-schedulers.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-schedulers.yaml b/.github/workflows/test-schedulers.yaml index 5b35709f97..e496179547 100644 --- a/.github/workflows/test-schedulers.yaml +++ b/.github/workflows/test-schedulers.yaml @@ -21,4 +21,4 @@ jobs: docker compose -f .github/pseudo-cluster/docker-compose.yml build - name: Run Unittests with ${{ matrix.scheduler }} sceduler run: | - BACKEND=${{ matrix.scheduler }}docker compose -f .github/pseudo-cluster/docker-compose.yml up --abort-on-container-exit --exit-code-from frontend + BACKEND=${{ matrix.scheduler }} docker compose -f .github/pseudo-cluster/docker-compose.yml up --abort-on-container-exit --exit-code-from frontend From 95baa8a37f2a54416ae48229092092e135dc0c8c Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Wed, 25 Sep 2024 11:39:41 +0200 Subject: [PATCH 6/7] Add pbs scheduler testing Signed-off-by: Theofilos Manitaras --- .github/pseudo-cluster/reframe/docker-entrypoint.sh | 2 ++ .github/workflows/test-schedulers.yaml | 2 +- config/ci-cluster.py | 11 +++++++++-- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/.github/pseudo-cluster/reframe/docker-entrypoint.sh b/.github/pseudo-cluster/reframe/docker-entrypoint.sh index 9509ccb508..c49b721388 100755 --- a/.github/pseudo-cluster/reframe/docker-entrypoint.sh +++ b/.github/pseudo-cluster/reframe/docker-entrypoint.sh @@ -16,6 +16,8 @@ cp -r /usr/local/share/reframe . cd reframe ./bootstrap.sh +echo "Running unittests with backend scheduler: ${BACKEND}" + tempdir=$(mktemp -d -p /scratch) TMPDIR=$tempdir ./test_reframe.py -v \ --rfm-user-config=config/ci-cluster.py \ diff --git a/.github/workflows/test-schedulers.yaml b/.github/workflows/test-schedulers.yaml index e496179547..13613eb5b0 100644 --- a/.github/workflows/test-schedulers.yaml +++ b/.github/workflows/test-schedulers.yaml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - scheduler: ['squeue', 'torque'] + scheduler: ['pbs', 'squeue', 'torque'] steps: - uses: actions/checkout@v4 - name: Login to GitHub Container Registry diff --git a/config/ci-cluster.py b/config/ci-cluster.py index 32487114ad..3d30203fbd 100644 --- a/config/ci-cluster.py +++ b/config/ci-cluster.py @@ -28,12 +28,19 @@ { 'name': 'compute-torque', 'descr': 'Torque compute nodes', - 'scheduler': 'squeue', + 'scheduler': 'torque', + 'launcher': 'mpiexec', + 'access': ['-p all'], + 'environs': ['gnu', 'clang'] + }, + { + 'name': 'compute-pbs', + 'descr': 'PBS compute nodes', + 'scheduler': 'pbs', 'launcher': 'mpiexec', 'access': ['-p all'], 'environs': ['gnu', 'clang'] } - ] }, ], From 45541fb2da9fcc66e18b077f6e1a612743376835 Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Thu, 26 Sep 2024 12:02:14 +0200 Subject: [PATCH 7/7] Address PR comments Signed-off-by: Theofilos Manitaras --- .github/pseudo-cluster/reframe/cgroup.conf | 7 +- .../reframe/docker-entrypoint.sh | 2 +- .github/pseudo-cluster/reframe/slurm.conf | 154 +--------------- .github/workflows/test-schedulers.yaml | 2 +- Jenkinsfile | 169 ------------------ ci-scripts/ci-runner.bash | 4 +- {config => ci-scripts/configs}/ci-cluster.py | 0 {config => ci-scripts/configs}/cscs-ci.py | 0 8 files changed, 6 insertions(+), 332 deletions(-) mode change 100644 => 120000 .github/pseudo-cluster/reframe/cgroup.conf mode change 100644 => 120000 .github/pseudo-cluster/reframe/slurm.conf delete mode 100644 Jenkinsfile rename {config => ci-scripts/configs}/ci-cluster.py (100%) rename {config => ci-scripts/configs}/cscs-ci.py (100%) diff --git a/.github/pseudo-cluster/reframe/cgroup.conf b/.github/pseudo-cluster/reframe/cgroup.conf deleted file mode 100644 index 0b634635a4..0000000000 --- a/.github/pseudo-cluster/reframe/cgroup.conf +++ /dev/null @@ -1,6 +0,0 @@ -CgroupAutomount=yes -CgroupReleaseAgentDir="/etc/slurm/cgroup" -ConstrainCores=yes -ConstrainDevices=yes -ConstrainRAMSpace=yes - diff --git a/.github/pseudo-cluster/reframe/cgroup.conf b/.github/pseudo-cluster/reframe/cgroup.conf new file mode 120000 index 0000000000..c99f23956f --- /dev/null +++ b/.github/pseudo-cluster/reframe/cgroup.conf @@ -0,0 +1 @@ +../../../examples/tutorial/dockerfiles/slurm-cluster/reframe/cgroup.conf \ No newline at end of file diff --git a/.github/pseudo-cluster/reframe/docker-entrypoint.sh b/.github/pseudo-cluster/reframe/docker-entrypoint.sh index c49b721388..665f23ff8d 100755 --- a/.github/pseudo-cluster/reframe/docker-entrypoint.sh +++ b/.github/pseudo-cluster/reframe/docker-entrypoint.sh @@ -20,5 +20,5 @@ echo "Running unittests with backend scheduler: ${BACKEND}" tempdir=$(mktemp -d -p /scratch) TMPDIR=$tempdir ./test_reframe.py -v \ - --rfm-user-config=config/ci-cluster.py \ + --rfm-user-config=ci-scripts/configs/ci-cluster.py \ --rfm-user-system=pseudo-cluster:compute-${BACKEND:-squeue} diff --git a/.github/pseudo-cluster/reframe/slurm.conf b/.github/pseudo-cluster/reframe/slurm.conf deleted file mode 100644 index 9e2c782156..0000000000 --- a/.github/pseudo-cluster/reframe/slurm.conf +++ /dev/null @@ -1,153 +0,0 @@ -# slurm.conf file generated by configurator.html. -# Put this file on all nodes of your cluster. -# See the slurm.conf man page for more information. -# -SlurmctldHost=slurm-master -# -#DisableRootJobs=NO -#EnforcePartLimits=NO -#Epilog= -#EpilogSlurmctld= -#FirstJobId=1 -#MaxJobId=999999 -#GresTypes= -#GroupUpdateForce=0 -#GroupUpdateTime=600 -#JobFileAppend=0 -#JobRequeue=1 -#JobSubmitPlugins=1 -#KillOnBadExit=0 -#LaunchType=launch/slurm -#Licenses=foo*4,bar -#MailProg=/bin/mail -#MaxJobCount=5000 -#MaxStepCount=40000 -#MaxTasksPerNode=128 -MpiDefault=pmi2 -#MpiParams=ports=#-# -#PluginDir= -#PlugStackConfig= -#PrivateData=jobs -#ProctrackType=proctrack/cgroup -ProctrackType=proctrack/linuxproc -#Prolog= -#PrologFlags= -#PrologSlurmctld= -#PropagatePrioProcess=0 -#PropagateResourceLimits= -#PropagateResourceLimitsExcept= -#RebootProgram= -ReturnToService=1 -#SallocDefaultCommand= -SlurmctldPidFile=/var/run/slurmctld.pid -SlurmctldPort=6817 -SlurmdPidFile=/var/run/slurmd.pid -SlurmdPort=6818 -SlurmdSpoolDir=/var/spool/slurmd -SlurmUser=root -#SlurmdUser=root -#SrunEpilog= -#SrunProlog= -StateSaveLocation=/var/spool -SwitchType=switch/none -#TaskEpilog= -TaskPlugin=task/affinity -TaskPluginParam=Sched -#TaskProlog= -#TopologyPlugin=topology/tree -#TmpFS=/tmp -#TrackWCKey=no -#TreeWidth= -#UnkillableStepProgram= -#UsePAM=0 -# -# -# TIMERS -#BatchStartTimeout=10 -#CompleteWait=0 -#EpilogMsgTime=2000 -#GetEnvTimeout=2 -#HealthCheckInterval=0 -#HealthCheckProgram= -InactiveLimit=0 -KillWait=30 -#MessageTimeout=10 -#ResvOverRun=0 -MinJobAge=300 -#OverTimeLimit=0 -SlurmctldTimeout=120 -SlurmdTimeout=300 -#UnkillableStepTimeout=60 -#VSizeFactor=0 -Waittime=0 -# -# -# SCHEDULING -#DefMemPerCPU=0 -#MaxMemPerCPU=0 -#SchedulerTimeSlice=30 -SchedulerType=sched/backfill -SelectType=select/cons_res -SelectTypeParameters=CR_Core -# -# -# JOB PRIORITY -#PriorityFlags= -#PriorityType=priority/basic -#PriorityDecayHalfLife= -#PriorityCalcPeriod= -#PriorityFavorSmall= -#PriorityMaxAge= -#PriorityUsageResetPeriod= -#PriorityWeightAge= -#PriorityWeightFairshare= -#PriorityWeightJobSize= -#PriorityWeightPartition= -#PriorityWeightQOS= -# -# -# LOGGING AND ACCOUNTING -#AccountingStorageEnforce=0 -#AccountingStorageHost= -#AccountingStorageLoc= -#AccountingStoragePass= -#AccountingStoragePort= -AccountingStorageType=accounting_storage/none -#AccountingStorageUser= -AccountingStoreJobComment=YES -ClusterName=cluster -#DebugFlags= -#JobCompHost= -#JobCompLoc= -#JobCompPass= -#JobCompPort= -JobCompType=jobcomp/none -#JobCompUser= -#JobContainerType=job_container/none -JobAcctGatherFrequency=30 -JobAcctGatherType=jobacct_gather/none -SlurmctldDebug=error -SlurmctldLogFile=/var/log/slurm-llnl/slurmctld.log -SlurmdDebug=error -SlurmdLogFile=/var/log/slurm-llnl/slurmd.log -#SlurmSchedLogFile= -#SlurmSchedLogLevel= -# -# -# POWER SAVE SUPPORT FOR IDLE NODES (optional) -#SuspendProgram= -#ResumeProgram= -#SuspendTimeout= -#ResumeTimeout= -#ResumeRate= -#SuspendExcNodes= -#SuspendExcParts= -#SuspendRate= -#SuspendTime= -# -# -# COMPUTE NODES -# -NodeName=nid0[0-2] REPLACE_IT State=UNKNOWN -PartitionName=all Nodes=ALL Default=YES MaxTime=INFINITE State=UP - diff --git a/.github/pseudo-cluster/reframe/slurm.conf b/.github/pseudo-cluster/reframe/slurm.conf new file mode 120000 index 0000000000..39eb5d7b6f --- /dev/null +++ b/.github/pseudo-cluster/reframe/slurm.conf @@ -0,0 +1 @@ +../../../examples/tutorial/dockerfiles/slurm-cluster/reframe/slurm.conf \ No newline at end of file diff --git a/.github/workflows/test-schedulers.yaml b/.github/workflows/test-schedulers.yaml index 13613eb5b0..83285e0988 100644 --- a/.github/workflows/test-schedulers.yaml +++ b/.github/workflows/test-schedulers.yaml @@ -1,4 +1,4 @@ -name: Test Schedulers +name: ReFrame CI / Scheduler backend tests on: pull_request: [] diff --git a/Jenkinsfile b/Jenkinsfile deleted file mode 100644 index 1553a9b1b1..0000000000 --- a/Jenkinsfile +++ /dev/null @@ -1,169 +0,0 @@ -#!/usr/bin/env groovy - -def dirPrefix = 'reframe-ci' -def loginBash = '#!/bin/bash -l' -def bashScript = 'ci-scripts/ci-runner.bash' -def machinesList = params.machines.split() -def machinesToRun = machinesList -def runTests = true -def uniqueID - -stage('Initialization') { - node('master') { - catchError(stageResult: 'FAILURE') { - uniqueID = "${env.ghprbActualCommit[0..6]}-${env.BUILD_ID}" - echo 'Environment Variables:' - echo sh(script: 'env|sort', returnStdout: true) - - def githubComment = env.ghprbCommentBody - if (githubComment == 'null' || !githubComment.trim().startsWith('@jenkins-cscs')) { - machinesToRun = machinesList - currentBuild.result = 'SUCCESS' - return - } - - def splittedComment = githubComment.split() - if (splittedComment.size() < 3) { - println 'No machines were found. Aborting...' - currentBuild.result = 'ABORTED' - return - } - if (splittedComment[1] != 'retry') { - println "Invalid command ${splittedComment[1]}. Aborting..." - currentBuild.result = 'ABORTED' - return - } - if (splittedComment[2] == 'all') { - machinesToRun = machinesList - currentBuild.result = 'SUCCESS' - return - } - else if (splittedComment[2] == 'none') { - runTests = false - currentBuild.result = 'SUCCESS' - return - } - - machinesRequested = [] - for (i = 2; i < splittedComment.size(); i++) { - machinesRequested.add(splittedComment[i]) - } - - machinesToRun = machinesRequested.findAll({it in machinesList}) - if (!machinesToRun) { - println 'No machines were found. Aborting...' - currentBuild.result = 'ABORTED' - return - } - currentBuild.result = 'SUCCESS' - } - } -} - -if (!runTests) { - println "Won't execute any test (${currentBuild.result}). Exiting..." - return -} - -if (currentBuild.result != 'SUCCESS') { - println "Initialization failed (${currentBuild.result}). Exiting..." - return -} - -def builds = [:] -stage('Unittest') { - for (mach in machinesToRun) { - def machineName = mach - builds[machineName] = { - node(machineName) { - def scratch = sh(returnStdout: true, - script: """${loginBash} - echo \$SCRATCH""").trim() - def reframeDir = "${scratch}/${dirPrefix}-${machineName}-${uniqueID}" - dir(reframeDir) { - checkout scm - sh("""${loginBash} - bash ${reframeDir}/${bashScript} -f ${reframeDir} -i ''""") - } - } - } - } - - catchError(stageResult: 'FAILURE') { - parallel builds - } -} - -builds = [:] -stage('Tutorial Check') { - if (currentBuild.result != 'SUCCESS') { - println 'Not executing "Tutorial Check" Stage' - return - } - else { - catchError(stageResult: 'FAILURE') { - if (!('daint' in machinesToRun)) { - return - } - node('daint') { - def scratch = sh(returnStdout: true, - script: """${loginBash} - echo \$SCRATCH""").trim() - def reframeDir = "${scratch}/${dirPrefix}-daint-${uniqueID}" - dir(reframeDir) { - sh("""${loginBash} - bash ${reframeDir}/${bashScript} -f ${reframeDir} -i '' -t""") - } - } - } - } -} - -builds = [:] -stage('Cleanup') { - if (currentBuild.result != 'SUCCESS') { - println 'Not executing "Cleanup" Stage' - return - } - else { - for (mach in machinesToRun) { - def machineName = mach - builds[machineName] = { - node(machineName) { - def scratch = sh(returnStdout: true, - script: """$loginBash - echo \$SCRATCH""").trim() - def reframeDir = "${scratch}/${dirPrefix}-${machineName}-${uniqueID}" - sh("""${loginBash} - rm -rf ${reframeDir} - date""") - - } - } - } - catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') { - parallel builds - } - } -} - -def staleCleanupInterval = 3 -builds = [:] -stage('Cleanup Stale') { - for (mach in machinesToRun) { - def machineName = mach - builds[machineName] = { - node(machineName) { - def scratch = sh(returnStdout: true, - script: """${loginBash} - echo \$SCRATCH""").trim() - sh("""${loginBash} - find ${scratch} -maxdepth 1 -name 'reframe-ci*' -ctime +${staleCleanupInterval} -type d -exec printf 'Removing %s\\n' {} + - find ${scratch} -maxdepth 1 -name 'reframe-ci*' -ctime +${staleCleanupInterval} -type d -exec rm -rf {} +""") - } - } - } - catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') { - parallel builds - } -} diff --git a/ci-scripts/ci-runner.bash b/ci-scripts/ci-runner.bash index 50130481b9..08e0062c9d 100644 --- a/ci-scripts/ci-runner.bash +++ b/ci-scripts/ci-runner.bash @@ -170,7 +170,7 @@ else for backend in slurm pbs torque; do echo "[INFO] Running unit tests with ${backend}" TMPDIR=$tempdir checked_exec ./test_reframe.py ${parallel_opts} \ - --rfm-user-config=config/cscs-ci.py \ + --rfm-user-config=ci-scripts/configs/cscs-ci.py \ -W=error::reframe.core.warnings.ReframeDeprecationWarning \ --rfm-user-system=dom:${backend} -ra done @@ -178,7 +178,7 @@ else else echo "[INFO] Running unit tests" TMPDIR=$tempdir checked_exec ./test_reframe.py ${parallel_opts} \ - --rfm-user-config=config/cscs-ci.py \ + --rfm-user-config=ci-scripts/configs/cscs-ci.py \ -W=error::reframe.core.warnings.ReframeDeprecationWarning -ra fi diff --git a/config/ci-cluster.py b/ci-scripts/configs/ci-cluster.py similarity index 100% rename from config/ci-cluster.py rename to ci-scripts/configs/ci-cluster.py diff --git a/config/cscs-ci.py b/ci-scripts/configs/cscs-ci.py similarity index 100% rename from config/cscs-ci.py rename to ci-scripts/configs/cscs-ci.py