Skip to content

Commit

Permalink
Merge pull request #3268 from teojgo/ci/slurm_multinode
Browse files Browse the repository at this point in the history
[ci] Run scheduler unittests with GitHub Actions
  • Loading branch information
vkarak authored Oct 1, 2024
2 parents b8fd652 + e5d2d81 commit 2ef03f0
Show file tree
Hide file tree
Showing 10 changed files with 281 additions and 171 deletions.
114 changes: 114 additions & 0 deletions .github/pseudo-cluster/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# Pseudo-cluster used to run the ReFrame scheduler unit tests.
#
# Services:
#   munge-key-generator  reports healthy once /scratch/munge.key exists
#   slurm-master         container built from the slurm-master image
#   node0..node2         containers built from the slurm-node image
#   frontend             built from reframe/Dockerfile; its entrypoint runs
#                        the unit tests
#
# All services share the `shared-scratch` volume (mounted at /scratch), and
# every service except munge-key-generator shares `shared-home`
# (mounted at /home/admin).
services:
  munge-key-generator:
    image: ghcr.io/reframe-hpc/munge-ubuntu:20.04
    hostname: munge-host
    # Dependants wait on this check (condition: service_healthy), i.e. until
    # the key file is present on the shared scratch volume.
    healthcheck:
      test: ["CMD-SHELL", "test -f /scratch/munge.key"]
      interval: 10s
      timeout: 10s
      retries: 5
    volumes:
      - shared-scratch:/scratch

  frontend:
    image: slurm-reframe
    container_name: frontend
    build:
      # The Dockerfile path is resolved relative to the build context,
      # which is two directories above this file.
      dockerfile: .github/pseudo-cluster/reframe/Dockerfile
      context: ../../
    hostname: login
    user: admin
    init: true
    volumes:
      - shared-home:/home/admin:rw
      - shared-scratch:/scratch:rw
    links:
      - slurm-master
    depends_on:
      munge-key-generator:
        condition: service_healthy
      slurm-master:
        condition: service_started
      node0:
        condition: service_started
      node1:
        condition: service_started
      node2:
        condition: service_started
    environment:
      - SLURM_CPUS_ON_NODE=1
      # Taken from the environment of the `docker compose` invocation;
      # falls back to `squeue` when BACKEND is unset or empty.
      - BACKEND=${BACKEND:-squeue}

  slurm-master:
    image: ghcr.io/reframe-hpc/slurm-master-ubuntu:20.04
    hostname: slurm-master
    user: admin
    volumes:
      - shared-home:/home/admin
      - shared-scratch:/scratch:rw
    depends_on:
      munge-key-generator:
        condition: service_healthy
    environment:
      - SLURM_CPUS_ON_NODE=1

  # The three node services below differ only in hostname, container name
  # and SLURM_NODENAME.
  node0:
    image: ghcr.io/reframe-hpc/slurm-node-ubuntu:20.04
    hostname: nid00
    container_name: slurm-node0
    user: admin
    volumes:
      - shared-home:/home/admin
      - shared-scratch:/scratch:rw
    environment:
      - SLURM_NODENAME=nid00
      - SLURM_CPUS_ON_NODE=1
    depends_on:
      munge-key-generator:
        condition: service_healthy
      slurm-master:
        condition: service_started
    links:
      - slurm-master

  node1:
    image: ghcr.io/reframe-hpc/slurm-node-ubuntu:20.04
    hostname: nid01
    container_name: slurm-node1
    user: admin
    volumes:
      - shared-home:/home/admin
      - shared-scratch:/scratch:rw
    environment:
      - SLURM_NODENAME=nid01
      - SLURM_CPUS_ON_NODE=1
    depends_on:
      munge-key-generator:
        condition: service_healthy
      slurm-master:
        condition: service_started
    links:
      - slurm-master

  node2:
    image: ghcr.io/reframe-hpc/slurm-node-ubuntu:20.04
    hostname: nid02
    container_name: slurm-node2
    user: admin
    volumes:
      - shared-home:/home/admin
      - shared-scratch:/scratch:rw
    environment:
      - SLURM_NODENAME=nid02
      - SLURM_CPUS_ON_NODE=1
    depends_on:
      munge-key-generator:
        condition: service_healthy
      slurm-master:
        condition: service_started
    links:
      - slurm-master

# Named volumes with default settings.
volumes:
  shared-home:
  shared-scratch:
42 changes: 42 additions & 0 deletions .github/pseudo-cluster/reframe/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Image for the container that runs the ReFrame unit tests against the
# pseudo-cluster. Must be built with the repository root as build context,
# since the COPY instructions below use repository-relative paths.
FROM ubuntu:20.04

# Build-time only: keeps package installation from prompting.
ARG DEBIAN_FRONTEND=noninteractive

# `apt-get` is used instead of `apt`, whose command-line interface is not
# intended for scripted use. The package lists are removed in the same layer
# to keep the image smaller.
RUN apt-get update -y && \
    apt-get install -y \
        build-essential \
        clang jq libomp-dev tree vim \
        git \
        mariadb-client \
        munge \
        slurm-client \
        slurm-wlm-torque \
        sudo \
        python3 \
        python3-pip \
        wget \
        curl \
        mpich \
        libmpich-dev && \
    rm -rf /var/lib/apt/lists/*

# Create the `admin` user with password-less sudo; the entrypoint script
# calls sudo for the munge and slurm.conf setup.
RUN useradd -m admin -s /usr/bin/bash -d /home/admin && \
    echo "admin:admin" | chpasswd && adduser admin sudo && \
    echo "admin ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers

COPY .github/pseudo-cluster/reframe/slurm.conf /etc/slurm-llnl/
COPY .github/pseudo-cluster/reframe/cgroup.conf /etc/slurm-llnl/
COPY .github/pseudo-cluster/reframe/docker-entrypoint.sh /etc/slurm-llnl/
# The whole repository; the entrypoint copies it from here at start-up.
COPY . /usr/local/share/reframe

RUN mkdir /scratch && \
    chown -R admin:admin /scratch

RUN chmod +rx /etc/slurm-llnl/docker-entrypoint.sh

WORKDIR /home/admin

# key=value form; the space-separated `ENV key value` form is legacy syntax.
ENV USER=admin
ENV SHELL=bash

ENTRYPOINT ["/etc/slurm-llnl/docker-entrypoint.sh"]
1 change: 1 addition & 0 deletions .github/pseudo-cluster/reframe/cgroup.conf
24 changes: 24 additions & 0 deletions .github/pseudo-cluster/reframe/docker-entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash
#
# Container entrypoint: waits for the munge key, starts munge, patches
# slurm.conf, bootstraps ReFrame and runs its unit tests against the
# scheduler backend selected by $BACKEND (default: squeue).
#
# Environment:
#   SLURM_CPUS_ON_NODE  value substituted for REPLACE_IT in slurm.conf
#   BACKEND             suffix of the `pseudo-cluster:compute-*` system name
#
# The exit status is that of the test run, unless changing into the
# ReFrame copy fails (exit 1) or the script is interrupted (exit 0).

# Exit with status 0 on SIGINT. The command has to be a single quoted word:
# unquoted, `trap exit 0 INT` registers `exit` for both signal 0 (EXIT) and
# INT instead of registering `exit 0` for INT.
trap 'exit 0' INT

# Block until the munge key shows up under /scratch.
while [ ! -f /scratch/munge.key ]
do
    sleep 1
done

sudo cp /scratch/munge.key /etc/munge/munge.key
sudo service munge start
sudo sed -i "s/REPLACE_IT/CPUs=${SLURM_CPUS_ON_NODE}/g" /etc/slurm-llnl/slurm.conf

# Needs to be copied in the shared home directory
cp -r /usr/local/share/reframe .
# Do not run the bootstrap and the tests from the wrong directory.
cd reframe || exit 1
./bootstrap.sh

# Resolve the default once so the log message and the selected system agree.
backend=${BACKEND:-squeue}
echo "Running unittests with backend scheduler: ${backend}"

# Temporary files go under /scratch instead of the default location.
tempdir=$(mktemp -d -p /scratch)
TMPDIR="$tempdir" ./test_reframe.py -v \
    --rfm-user-config=ci-scripts/configs/ci-cluster.py \
    --rfm-user-system="pseudo-cluster:compute-${backend}"
1 change: 1 addition & 0 deletions .github/pseudo-cluster/reframe/slurm.conf
24 changes: 24 additions & 0 deletions .github/workflows/test-schedulers.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Runs the scheduler backend unit tests inside the Docker Compose
# pseudo-cluster defined in .github/pseudo-cluster/docker-compose.yml,
# once per scheduler listed in the matrix.
name: ReFrame CI / Scheduler backend tests
on:
  pull_request: []

jobs:
  scheduler-test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        scheduler: ['pbs', 'squeue', 'torque']
    steps:
      - uses: actions/checkout@v4
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build Images
        run: |
          docker compose -f .github/pseudo-cluster/docker-compose.yml build
      - name: Run Unittests with ${{ matrix.scheduler }} scheduler
        # BACKEND is read by the compose file to select the scheduler.
        # The step's result is the exit code of the `frontend` container,
        # and all containers are stopped as soon as any one of them exits.
        run: |
          BACKEND=${{ matrix.scheduler }} docker compose -f .github/pseudo-cluster/docker-compose.yml up --abort-on-container-exit --exit-code-from frontend
169 changes: 0 additions & 169 deletions Jenkinsfile

This file was deleted.

Loading

0 comments on commit 2ef03f0

Please sign in to comment.