diff --git a/README.md b/README.md index 1d9fe11..b786871 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,29 @@ with an empty commit, do git commit --allow-empty -m "trigger pipeline" && git push +A portion of the CI is run on LANL's internal Darwin platform. To launch this CI job, someone with +Darwin access (usually a LANL employee) must first create a Github Personal Access Token, like so: + +- `github.com` profile -> `Settings` -> `Developer Settings` -> `Personal Access Tokens` -> `Tokens (classic)` +- Click the `Generate New Token` button -> `Generate New Token (classic)` +- Name it something like `jaybenne_token` in the `Note` box +- Click the `workflow` checkbox (which will also check the `repo` boxes) +- `Generate token` +- You only get to see the token once, so immediately copy it. + +Store the token securely in your own environment as `JAYBENNE_GITHUB_TOKEN`, e.g. in your Darwin `~/.bashrc`: + + export JAYBENNE_GITHUB_TOKEN=[token] + +and then, again from Darwin, manually launch the CI runner: + + cd jaybenne + ./tst/launch_ci_runner.py [Number of the github PR] + +Note that `launch_ci_runner.py` will create a temporary checkout of the current state of the branch associated +with this PR according to the `origin` remote, so you don't need to worry about the state of your local checkout +of `jaybenne`. + # Run driver executable cd build/src diff --git a/tst/launch_ci_runner.py b/tst/launch_ci_runner.py new file mode 100755 index 0000000..6c28e81 --- /dev/null +++ b/tst/launch_ci_runner.py @@ -0,0 +1,239 @@ +#!/usr/bin/env python3 +# ======================================================================================== +# (C) (or copyright) 2025. Triad National Security, LLC. All rights reserved. +# +# This program was produced under U.S. Government contract 89233218CNA000001 for Los +# Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC +# for the U.S. Department of Energy/National Nuclear Security Administration. All rights +# in the program are reserved by Triad National Security, LLC, and the U.S. Department +# of Energy/National Nuclear Security Administration. The Government is granted for +# itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide +# license in this material to reproduce, prepare derivative works, distribute copies to +# the public, perform publicly and display publicly, and to permit others to do so. +# ======================================================================================== + +# This file was created in part or in whole by one of OpenAI's generative AI models + +import subprocess +import socket +import fnmatch +import os +import requests +import sys +import json +import subprocess +import argparse +import tempfile +import shlex +from datetime import datetime + +# The personal access token (PAT) with 'repo:status' permission +# Store your token securely and do not hardcode it in the script +GITHUB_TOKEN = os.environ.get("JAYBENNE_GITHUB_TOKEN") + + +def get_pr_info(pr_number): + url = f"https://api.github.com/repos/lanl/jaybenne/pulls/{pr_number}" + headers = {"Authorization": f"token {GITHUB_TOKEN}"} + response = requests.get(url, headers=headers) + if response.status_code != 200: + print(f"Error fetching PR info: {response.status_code}") + print(response.text) + sys.exit(1) + return response.json() + + +def update_status( + commit_sha, state, description, context="Continuous Integration / darwin_volta-x86" +): + url = f"https://api.github.com/repos/lanl/jaybenne/statuses/{commit_sha}" + headers = {"Authorization": f"token {GITHUB_TOKEN}"} + data = {"state": state, "description": description, "context": context} + response = requests.post(url, headers=headers, data=json.dumps(data)) + if response.status_code != 201: + print(f"Error setting status: {response.status_code}") + print(response.text) + sys.exit(1) + + +def run_tests_in_temp_dir(pr_number, head_repo, head_ref, output_dir): + current_dir = os.getcwd() + + # Create a temporary directory + with tempfile.TemporaryDirectory() as temp_dir: + print(f"Using temporary directory: {temp_dir}") + + # Clone the repository into the temporary directory + subprocess.run(["git", "clone", head_repo, temp_dir], check=True) + os.chdir(temp_dir) + + # Checkout the PR branch + subprocess.run(["git", "pull", "--no-rebase", "origin", head_ref], check=True) + + # Update submodules + subprocess.run( + ["git", "submodule", "update", "--init", "--recursive"], check=True + ) + + # Run the tests + os.chdir(os.path.join(temp_dir, "tst")) + build_dir = os.path.join(temp_dir, "build") + + # Run subprocess command to compile code and launch run_tests.py + test_command = [ + "bash", + "-c", + "source ../env/bash && build_jaybenne -b " + + build_dir + + " -f && cd " + + os.path.join(temp_dir, "tst") + + " && ./stepdiff.py --executable " + + os.path.join(build_dir, "mcblock") + + " --input ../inputs/stepdiff.in" + + " --use_mpiexec" + + " && ./stepdiff.py --executable " + + os.path.join(build_dir, "mcblock") + + " --input ../inputs/stepdiff_ddmc.in --use_mpiexec" + + " && ./stepdiff_smr.py --executable " + + os.path.join(build_dir, "mcblock") + + " --input ../inputs/stepdiff_smr.in --use_mpiexec" + + " && ./stepdiff_smr.py --executable " + + os.path.join(build_dir, "mcblock") + + " --input ../inputs/stepdiff_smr_ddmc.in --use_mpiexec" + + " && ./stepdiff_smr.py --executable " + + os.path.join(build_dir, "mcblock") + + " --input ../inputs/stepdiff_smr_ddmc.in --use_mpiexec --mpi_nthreads 8" + + " && ./stepdiff_smr.py --executable " + + os.path.join(build_dir, "mcblock") + + " --input ../inputs/stepdiff_smr_hybrid.in --use_mpiexec" + + " && ./stepdiff_smr.py --executable " + + os.path.join(build_dir, "mcblock") + + " --input ../inputs/stepdiff_smr_hybrid.in --use_mpiexec --mpi_nthreads 8", + ] + ret = subprocess.run(test_command, check=True) + + # Return true if the test script succeeded + return ret.returncode == 0 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Run CI tasks with optional Slurm submission." + ) + parser.add_argument( + "pr_number", type=int, help="Pull request number for the CI run." + ) + parser.add_argument( + "--submission", + action="store_true", + help="Flag to indicate the script is running as a Slurm submission job.", + ) + parser.add_argument( + "--output_dir", + type=str, + default=None, + help="Output directory created when launching submission script", + ) + args = parser.parse_args() + + # Fetch PR information + pr_info = get_pr_info(args.pr_number) + head_repo = pr_info["head"]["repo"]["clone_url"] + head_ref = pr_info["head"]["ref"] + commit_sha = pr_info["head"]["sha"] + + if args.submission: + # Update github PR status to indicate we have begun testing + update_status(commit_sha, "pending", "CI Slurm job running...") + + # Run the tests in a temporary directory + test_success = run_tests_in_temp_dir( + args.pr_number, head_repo, head_ref, args.output_dir + ) + + # Update github PR status to indicate that testing has concluded + if test_success: + update_status(commit_sha, "success", "All tests passed.") + else: + update_status(commit_sha, "failure", "Tests failed.") + else: + # Check that we are on the right system + hostname = socket.gethostname() + cluster = os.getenv("SLURM_CLUSTER_NAME") + + if not fnmatch.fnmatch(hostname, "darwin-fe*"): + # if we are on a backend + if cluster is None or cluster.lower() != "darwin": + print("ERROR script must be run from Darwin!") + sys.exit(1) + + # Execute the sbatch command + try: + # Submit batch job with ci_runner script that will checkout and build the code and run + # tests + job_name = f"jaybenne_ci_darwin_volta-x86_PR{args.pr_number}" + + # Clean up existing jobs for same PR + squeue_command = f"squeue --name={shlex.quote(job_name)} --user=$(whoami) --noheader --format=%i" + squeue_result = subprocess.run( + squeue_command, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + ) + + job_ids = squeue_result.stdout.strip().split() + if len(job_ids) >= 1: + print("Canceling jobs:") + for job_id in job_ids: + print(f" {job_id}") + + # Use scancel to cancel the jobs + scancel_command = ["scancel"] + job_ids + scancel_result = subprocess.run( + scancel_command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + ) + + # Build output path and create directory if necessary + username = os.getenv("USER") + current_date_time = datetime.now().strftime("%Y-%m-%d_%H:%M:%S") + output_dir = os.path.join( + "/usr", + "projects", + "jovian", + "ci", + "jaybenne", + f"pr_{args.pr_number}", + current_date_time, + ) + subprocess.run(["mkdir", "-p", output_dir], check=True) + + # Create subprocess command for submitting CI job, and submit + sbatch_command = [ + "sbatch", + f"--job-name={job_name}", + f"--output={os.path.join(output_dir, job_name)}_%j.out", + f"--error={os.path.join(output_dir, job_name)}_%j.out", + "--partition=volta-x86", + "--time=04:00:00", + "--wrap", + f"python3 {sys.argv[0]} {args.pr_number} --submission --output_dir {output_dir}", + ] + result = subprocess.run( + sbatch_command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True, + universal_newlines=True, + ) + print(result.stdout.strip()) + + # Update PR status that we have successfully submitted to SLURM job + update_status(commit_sha, "pending", "CI SLURM job submitted...") + except subprocess.CalledProcessError: + # Update PR status that we have failed to submit the SLURM job + update_status(commit_sha, "failure", "SLURM job submission failed.")