Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Automatically kill stuck PR tests and report back #2440

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
26 changes: 26 additions & 0 deletions kill-stuck-pr-test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
# Report a PR test that timed out waiting for a node: close its pending
# commit statuses, write the property file that asks for the Jenkins job to be
# aborted, and upload placeholder ROCm results.
#
# Environment read by this script:
#   UPLOAD_UNIQ_ID / UPLOAD_UNIQUE_ID  id of the PR test upload area
#   PULL_REQUEST                       "<repository>#<number>"
#   CONTEXT                            prefix of the commit-status contexts
#   JENKINS_PROJECT_TO_KILL, JENKINS_PROJECT_PARAMS, EXTRA_PARAMS
#                                      copied verbatim into abort-jenkins-job.prop

# Remove property files already present in the working directory.
rm -f *.prop

# parse_jenkins_builds.py writes this id as UPLOAD_UNIQ_ID in its abort-*.prop
# files; accept that spelling as well as the one originally tested here so the
# script does not exit early when only UPLOAD_UNIQ_ID is set.
UPLOAD_UNIQUE_ID="${UPLOAD_UNIQUE_ID:-${UPLOAD_UNIQ_ID}}"

if [ -z "${UPLOAD_UNIQUE_ID}" ] ; then exit 0 ; fi
if [ -z "${PULL_REQUEST}" ] ; then exit 0 ; fi

REPOSITORY=$(echo "${PULL_REQUEST}" | cut -d '#' -f 1)
# Not used below; presumably consumed by setup-pr-test-env.sh -- TODO confirm.
PR_ID=$(echo "${PULL_REQUEST}" | cut -d '#' -f 2)

# prs_commits.txt is matched on "<pull request>=" lines; keep only the part
# after the "=" so the helper receives a bare commit id instead of the whole
# "<pull request>=<commit>" line.
COMMIT_ID=$(curl -s -L "http://localhost/SDT/jenkins-artifacts/pull-request-integration/${UPLOAD_UNIQUE_ID}/prs_commits.txt" \
  | grep "^${PULL_REQUEST}=" | head -n 1 | cut -d '=' -f 2-)
if [ -z "${COMMIT_ID}" ] ; then exit 0 ; fi

# Run the helper through the interpreter: the script as added has no shebang
# line, so executing it directly would not start Python.
# NOTE(review): the helper is added to the repository as
# "update-commit-statues-matching.py" (statues, not statuses) -- confirm which
# spelling is intended and make the two agree.
python3 ./cms-bot/update-commit-statuses-matching.py -r "${REPOSITORY}" -c "${COMMIT_ID}" -p "${CONTEXT}" rocm

# Property file describing the Jenkins job to abort.
touch abort-jenkins-job.prop
echo "JENKINS_PROJECT_TO_KILL=${JENKINS_PROJECT_TO_KILL}" >> abort-jenkins-job.prop
echo "JENKINS_PROJECT_PARAMS=${JENKINS_PROJECT_PARAMS}" >> abort-jenkins-job.prop
echo "EXTRA_PARAMS=${EXTRA_PARAMS}" >> abort-jenkins-job.prop

# Presumably defines RESULTS_DIR and prepare_upload_results used below -- TODO confirm.
source "$(dirname "$0")/setup-pr-test-env.sh"

# Placeholder results marking the ROCm relvals and unit tests as errored.
echo "MATRIXROCM_TESTS;ERROR,Matrix ROCM Tests Outputs,Timed out waiting for node,none" > "${RESULTS_DIR}/relvalROCM.txt"
echo "RelVals-ROCM" > "${RESULTS_DIR}/12ROCM-relvals-failed.res"
echo "rocm_UNIT_TEST_RESULTS;ERROR,ROCM GPU Unit Tests,Timed out waiting for node,none" > "${RESULTS_DIR}/unittestrocm.txt"
echo "rocmUnitTests" > "${RESULTS_DIR}/14-failed.res"
prepare_upload_results
5 changes: 5 additions & 0 deletions parse_jenkins_builds.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"whitelist": ["ib-run-pr-unittests", "ib-run-pr-relvals", "ib-run-baseline"],
"timeout": 3600,
"custom": {}
}
62 changes: 52 additions & 10 deletions parse_jenkins_builds.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import subprocess
from es_utils import send_payload, get_payload, resend_payload, get_payload_wscroll
from cmsutils import epoch2week
import json

JENKINS_PREFIX = "jenkins"
try:
Expand Down Expand Up @@ -152,22 +153,63 @@ def grep(filename, pattern, verbose=False):
def _write_abort_request(element, job_name, index):
    """Write ``abort-<index>.prop`` for a stuck ROCm queue item.

    ``element`` is one entry of the queue listing; its ``params`` text is
    parsed as ``KEY=VALUE`` lines. Returns True when the property file was
    written, False when the item is not a ROCm job or lacks one of the
    parameters needed to describe it.
    """
    params = dict(
        line.split("=", 1)
        for line in (element.get("params") or "").strip().splitlines()
        if "=" in line
    )

    if "rocm" not in (params.get("GPU_FLAVOR"), params.get("TEST_FLAVOR")):
        return False

    try:
        pull_request = params["PULL_REQUEST"]
        release = params["RELEASE_FORMAT"]
        context = params["CONTEXT_PREFIX"]
        upload_unique_id = params["UPLOAD_UNIQ_ID"]
    except KeyError:
        return False

    main_params = f"PULL_REQUEST={pull_request}"
    # Iterate over items(): looping over the dict itself yields only keys and
    # fails when they are unpacked into (k, v).
    other_params = ";".join(f"{k}={v}" for k, v in params.items() if k != "PULL_REQUEST")

    with open(f"abort-{index}.prop", "w") as prop_file:
        prop_file.write(f"UPLOAD_UNIQ_ID={upload_unique_id}\n")
        prop_file.write(f"PULL_REQUEST={pull_request}\n")
        prop_file.write(f"CONTEXT={context}\n")
        prop_file.write(f"JENKINS_PROJECT_TO_KILL={job_name}\n")
        prop_file.write(f"JENKINS_PROJECT_PARAMS={main_params}\n")
        prop_file.write(f"EXTRA_PARAMS={other_params}\n")
        prop_file.write(f"RELEASE_FORMAT={release}\n")

    return True


jenkins_queue = dict()
current_time = get_current_time()

# Load the abort configuration once instead of once per queue item.
with open("parse_jenkins_builds.json") as config_file:
    config = json.load(config_file)

# Counter for the abort-<n>.prop files. It must live outside the loop so each
# stuck job gets its own file instead of overwriting abort-0.prop.
kill_index = 0

for element in queue_json["items"]:
    job_name = element["task"]["name"]
    queue_id = int(element["id"])
    queue_time = int(element["inQueueSince"])
    # Drop non-ASCII characters from the queue reason before classifying it.
    labels = element["why"].encode("ascii", "ignore").decode("ascii", "ignore")
    reason = process_queue_reason(labels)

    payload = {
        "jenkins_server": JENKINS_PREFIX,
        "in_queue_since": queue_time,
        "queue_id": queue_id,
        "job_name": job_name,
        "node_labels": reason,
        "in_queue": 1,
        "wait_time": current_time - queue_time,
        "start_time": 0,
    }

    # Abort stuck jobs: whitelisted jobs waiting on a single offline node for
    # longer than their timeout (per-job override in "custom", else "timeout").
    # wait_time is divided by 1000 before the comparison, so it is presumably
    # in milliseconds and the timeout in seconds -- TODO confirm.
    if (
        job_name in config["whitelist"]
        and reason.endswith("-offline")
        and reason != "multiple-offline"
        and (payload["wait_time"] / 1000 > config["custom"].get(job_name, config["timeout"]))
    ):
        # The helper returns instead of using `continue`, so an item that is
        # not aborted still reaches the rest of this loop body.
        if _write_abort_request(element, job_name, kill_index):
            kill_index += 1

unique_id = (
JENKINS_PREFIX + ":/build/builds/" + job_name + "/" + str(queue_id)
Expand Down
36 changes: 36 additions & 0 deletions update-commit-statues-matching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import github_utils
import argparse


def main(argv=None):
    """Close pending commit statuses whose context matches a prefix and suffix.

    Command line: ``-r REPOSITORY -c COMMIT -p PREFIX SUFFIX``. Every status
    of COMMIT that is still "pending" and whose context starts with
    ``PREFIX/`` and ends with ``/SUFFIX`` is re-marked with the description
    "Timed out waiting for node".

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Raises:
        SystemExit: raised by argparse when a required argument is missing.
    """
    parser = argparse.ArgumentParser(
        description="Close pending commit statuses matching <prefix>/.../<suffix>."
    )
    # All options are required: without them the prefix would silently become
    # "None/" and None would be handed to the GitHub helpers.
    parser.add_argument("--repository", "-r", required=True)
    parser.add_argument("--commit", "-c", required=True)
    parser.add_argument("--prefix", "-p", required=True)
    parser.add_argument("suffix")
    args = parser.parse_args(argv)

    status_prefix = f"{args.prefix}/"
    status_suffix = f"/{args.suffix}"

    all_statuses = github_utils.get_combined_statuses(args.commit, args.repository).get(
        "statuses", []
    )

    for status in all_statuses:
        context = status["context"]
        if status["state"] != "pending":
            continue
        if not (context.startswith(status_prefix) and context.endswith(status_suffix)):
            continue
        # NOTE(review): a timed-out test is recorded with state "success";
        # confirm this is intended rather than "error"/"failure".
        github_utils.mark_commit_status(
            args.commit,
            args.repository,
            context,
            "success",
            "",
            "Timed out waiting for node",
        )


if __name__ == "__main__":
    main()
Loading