diff --git a/kill-stuck-pr-test.sh b/kill-stuck-pr-test.sh
new file mode 100755
index 00000000000..0ede63fa189
--- /dev/null
+++ b/kill-stuck-pr-test.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Handle a ROCm PR test that timed out waiting for a node: update the pending
+# commit statuses, write abort-jenkins-job.prop for the queued Jenkins job and
+# publish "Timed out waiting for node" results.
+rm -f *.prop
+
+# parse_jenkins_builds.py writes UPLOAD_UNIQ_ID; UPLOAD_UNIQUE_ID is kept as a fallback.
+UPLOAD_UNIQ_ID="${UPLOAD_UNIQ_ID:-${UPLOAD_UNIQUE_ID}}"
+if [ "X${UPLOAD_UNIQ_ID}" = "X" ] ; then exit 0 ; fi
+if [ "X${PULL_REQUEST}" = "X" ] ; then exit 0 ; fi
+
+REPOSITORY=$(echo ${PULL_REQUEST} | cut -d '#' -f 1)
+PR_ID=$(echo ${PULL_REQUEST} | cut -d '#' -f 2)
+
+# grep returns the whole "<PULL_REQUEST>=<value>" line; keep only the value.
+COMMIT_ID=$(curl -L http://localhost/SDT/jenkins-artifacts/pull-request-integration/${UPLOAD_UNIQ_ID}/prs_commits.txt | grep "^${PULL_REQUEST}=" | sed 's|^[^=]*=||')
+if [ "X${COMMIT_ID}" = "X" ] ; then exit 0 ; fi
+
+$(dirname $0)/update-commit-statuses-matching.py -r ${REPOSITORY} -c ${COMMIT_ID} -p ${CONTEXT} rocm
+
+touch abort-jenkins-job.prop
+echo "JENKINS_PROJECT_TO_KILL=${JENKINS_PROJECT_TO_KILL}" >> abort-jenkins-job.prop
+echo "JENKINS_PROJECT_PARAMS=${JENKINS_PROJECT_PARAMS}" >> abort-jenkins-job.prop
+echo "EXTRA_PARAMS=${EXTRA_PARAMS}" >> abort-jenkins-job.prop
+
+source $(dirname $0)/setup-pr-test-env.sh
+
+echo "MATRIXROCM_TESTS;ERROR,Matrix ROCM Tests Outputs,Timed out waiting for node,none" > ${RESULTS_DIR}/relvalROCM.txt
+echo "RelVals-ROCM" > ${RESULTS_DIR}/12ROCM-relvals-failed.res
+echo "rocm_UNIT_TEST_RESULTS;ERROR,ROCM GPU Unit Tests,Timed out waiting for node,none" > ${RESULTS_DIR}/unittestrocm.txt
+echo "rocmUnitTests" > ${RESULTS_DIR}/14-failed.res
+prepare_upload_results
diff --git a/parse_jenkins_builds.json b/parse_jenkins_builds.json
new file mode 100644
index 00000000000..848f89cd53e
--- /dev/null
+++ b/parse_jenkins_builds.json
@@ -0,0 +1,5 @@
+{
+  "whitelist": ["ib-run-pr-unittests", "ib-run-pr-relvals", "ib-run-baseline"],
+  "timeout": 3600,
+  "custom": {}
+}
diff --git a/parse_jenkins_builds.py b/parse_jenkins_builds.py
index 4d267eb0466..ed12630715b 100755
--- a/parse_jenkins_builds.py
+++ b/parse_jenkins_builds.py
@@ -5,6 +5,7 @@
 import subprocess
 from es_utils import send_payload, get_payload, resend_payload, get_payload_wscroll
 from cmsutils import epoch2week
+import json
 
 JENKINS_PREFIX = "jenkins"
 try:
@@ -152,22 +153,71 @@ def grep(filename, pattern, verbose=False):
 jenkins_queue = dict()
 current_time = get_current_time()
+
+# Abort policy for queued jobs; read once instead of once per queue element.
+with open("parse_jenkins_builds.json") as f:
+    config = json.load(f)
+
+# Numbers the abort-<n>.prop files; set before the loop so files are not overwritten.
+kill_index = 0
+
 for element in queue_json["items"]:
-    payload = dict()
-
     job_name = element["task"]["name"]
     queue_id = int(element["id"])
     queue_time = int(element["inQueueSince"])
     labels = element["why"].encode("ascii", "ignore").decode("ascii", "ignore")
     reason = process_queue_reason(labels)
 
-    payload["jenkins_server"] = JENKINS_PREFIX
-    payload["in_queue_since"] = queue_time
-    payload["queue_id"] = queue_id
-    payload["job_name"] = job_name
-    payload["node_labels"] = reason
-    payload["in_queue"] = 1
-    payload["wait_time"] = current_time - queue_time
-    payload["start_time"] = 0
+    payload = {
+        "jenkins_server": JENKINS_PREFIX,
+        "in_queue_since": queue_time,
+        "queue_id": queue_id,
+        "job_name": job_name,
+        "node_labels": reason,
+        "in_queue": 1,
+        "wait_time": current_time - queue_time,
+        "start_time": 0,
+    }
+
+    # Abort stuck jobs: whitelisted jobs whose wait (divided by 1000, so the configured
+    # timeouts are presumably seconds) exceeds the limit for one specific "-offline" reason.
+    if (
+        job_name in config["whitelist"]
+        and reason.endswith("-offline")
+        and reason != "multiple-offline"
+        and (payload["wait_time"] / 1000 > config["custom"].get(job_name, config["timeout"]))
+    ):
+        # The queue parameters are parsed as newline-separated KEY=VALUE text.
+        params = dict(
+            line.split("=", 1)
+            for line in (element.get("params") or "").strip().splitlines()
+            if "=" in line
+        )
+        required = ("PULL_REQUEST", "RELEASE_FORMAT", "CONTEXT_PREFIX", "UPLOAD_UNIQ_ID")
+
+        # Only ROCm tests carrying every required parameter get an abort file. No
+        # `continue` here: the element must still reach the queue handling below.
+        if "rocm" in (params.get("GPU_FLAVOR"), params.get("TEST_FLAVOR")) and all(
+            key in params for key in required
+        ):
+            pull_request = params["PULL_REQUEST"]
+            release = params["RELEASE_FORMAT"]
+            context = params["CONTEXT_PREFIX"]
+            upload_unique_id = params["UPLOAD_UNIQ_ID"]
+            main_params = f"PULL_REQUEST={pull_request}"
+            other_params = ";".join(
+                f"{k}={v}" for k, v in params.items() if k != "PULL_REQUEST"
+            )
+
+            with open(f"abort-{kill_index}.prop", "w") as f:
+                f.write(f"UPLOAD_UNIQ_ID={upload_unique_id}\n")
+                f.write(f"PULL_REQUEST={pull_request}\n")
+                f.write(f"CONTEXT={context}\n")
+                f.write(f"JENKINS_PROJECT_TO_KILL={job_name}\n")
+                f.write(f"JENKINS_PROJECT_PARAMS={main_params}\n")
+                f.write(f"EXTRA_PARAMS={other_params}\n")
+                f.write(f"RELEASE_FORMAT={release}\n")
+
+            kill_index += 1
 
     unique_id = (
         JENKINS_PREFIX + ":/build/builds/" + job_name + "/" + str(queue_id)
diff --git a/update-commit-statuses-matching.py b/update-commit-statuses-matching.py
new file mode 100755
index 00000000000..638e5f23676
--- /dev/null
+++ b/update-commit-statuses-matching.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+"""Update the pending commit statuses whose context matches a prefix and a suffix."""
+import github_utils
+import argparse
+
+
+def main():
+    """Re-mark the pending statuses of a commit that match a prefix and a suffix.
+
+    A status is updated when its context starts with "<prefix>/", ends with
+    "/<suffix>" and its state is "pending"; it is then passed to
+    github_utils.mark_commit_status with "success" and "Timed out waiting for node".
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--repository", "-r", required=True, help="repository of the commit")
+    parser.add_argument("--commit", "-c", required=True, help="commit whose statuses are read")
+    parser.add_argument("--prefix", "-p", required=True, help="leading part of the context")
+    parser.add_argument("suffix", help="trailing part of the context")
+    args = parser.parse_args()
+
+    status_prefix = f"{args.prefix}/"
+    status_suffix = f"/{args.suffix}"
+
+    all_statuses = github_utils.get_combined_statuses(args.commit, args.repository).get(
+        "statuses", []
+    )
+
+    for status in all_statuses:
+        if (
+            status["context"].startswith(status_prefix)
+            and status["context"].endswith(status_suffix)
+            and status["state"] == "pending"
+        ):
+            github_utils.mark_commit_status(
+                args.commit,
+                args.repository,
+                status["context"],
+                "success",
+                "",
+                "Timed out waiting for node",
+            )
+
+
+if __name__ == "__main__":
+    main()