diff --git a/scripts/tf-tests/.gitignore b/scripts/tf-tests/.gitignore new file mode 100644 index 00000000..a7b95ae9 --- /dev/null +++ b/scripts/tf-tests/.gitignore @@ -0,0 +1,3 @@ +hwctl +job_outputs/* +machines.txt diff --git a/scripts/tf-tests/README.md b/scripts/tf-tests/README.md new file mode 100644 index 00000000..712707a0 --- /dev/null +++ b/scripts/tf-tests/README.md @@ -0,0 +1,129 @@ +# Manual integration tests using Testflinger + +The purpose of the `run-jobs.py` script is to test hardware-api client +and server on the machines that are accessible via +[Testflinger](https://github.com/canonical/testflinger). It allows us +to test the project components on multiple machines automatically. + +## Requirements + +These scripts use [the testflinger +snap](https://snapcraft.io/testflinger-cli), so make sure you have it +installed on your system: + +```sh +sudo snap install testflinger-cli +``` + +Also, the following files are required to be present in this +directory: + +- `machines.txt`: This file lists the Canonical IDs of the machines on + which jobs will be run. Each Canonical ID should be on a separate + line, formatted as follows: + + ```text + 202101-28595 + 202012-28526 + ... + ``` + +- `tf-job.yaml`: This YAML template defines the job parameters. The + template should include the placeholder `$CANONICAL_ID`, which will + be replaced with each actual Canonical ID from `machines.txt` when + jobs are submitted. You can modify the existing `tf-job.yaml` file + to run other commands or use a different Ubuntu distro. + +- `hwctl`: The client executable. You can create it by running `cargo + build --release` in the project root directory. You're supposed to + build the binary on the same Ubuntu release that is specified in + `tf-job.yaml`. The machines should also be of the same architecture + the binary is built for. 
+ + Then copy the created file to this directory: + + ```sh + cp target/release/hwctl scripts/tf-tests/ + ``` + +## Running the script + +After you meet the described requirements, make sure you have access +to +[https://testflinger.canonical.com](https://testflinger.canonical.com). + +The script `run-jobs.py` can be used to submit jobs, monitor their +status, or both, depending on the options you provide. + +```sh +./run-jobs.py [options] +``` + +Examples: + +* Submit Jobs and Monitor Statuses Sequentially: `./run-jobs.py` +* Only Submit Jobs: `./run-jobs.py --send-jobs` +* Only Monitor Job Statuses: `./run-jobs.py --check-status` +* Custom Machines File and Poll Interval: `./run-jobs.py + --machines-file custom_machines.txt --poll-interval 60` + + +## Script overview + +The script performs two main functions: + +- Job Submission +- Job Monitoring + +### Job Submission + +When submitting jobs, the script: + +1. Reads each Canonical ID from `machines.txt` (or the file specified + with `--machines-file`). +2. Replaces `$CANONICAL_ID` in `tf-job.yaml` (or the file specified + with `--template-file`) with the actual ID. +3. Submits the job with `testflinger submit <job_file>`. +4. Captures the job UUID returned after submission. +5. Creates a directory for each Canonical ID in `job_outputs/` (or the + directory specified with `--output-dir`) and saves the job UUID in + a file named `tf_job_id.txt` within that directory. + +Example directory structure after job submission: + +``` +job_outputs/ +├── 202101-28595/ +│ └── tf_job_id.txt # Contains the job UUID +├── 202012-28526/ +│ └── tf_job_id.txt +``` + +### Job Monitoring + +When monitoring jobs, the script: + +1. Reads `tf_job_id.txt` files in `job_outputs/` to get the job UUIDs. +2. Enters a loop, checking the status of each job using `testflinger + status <job_id>`. +3. For jobs with status "complete", retrieves results using + `testflinger-cli results <job_id>`. +4. 
#!/usr/bin/env python3
"""Submit hardware-api client test jobs to Testflinger and collect results.

For every Canonical ID listed in the machines file, the job template is
rendered (``$CANONICAL_ID`` is replaced by the ID), submitted with
``testflinger submit`` and the returned job UUID is stored on disk.  Jobs are
then polled until they are complete or cancelled; for complete jobs the test
output and the extracted ``status`` field are saved.

Resulting layout of the output directory::

    job_outputs/
    ├── 202101-28595/
    │   ├── tf_job_id.txt   # job UUID
    │   ├── output.txt      # test output
    │   └── hw_status.txt   # status extracted from the test output
    └── 202012-28526/
        ├── tf_job_id.txt
        ├── output.txt
        └── hw_status.txt

Example job template (``tf-job.yaml``)::

    job_queue: "$CANONICAL_ID"
    provision_data:
      distro: jammy
    test_data:
      attachments:
        - local: hwctl
      test_cmds: |
        scp attachments/test/hwctl $DEVICE_IP:
        ssh $DEVICE_IP "sudo apt-get update && sudo apt install -y pkgconf libssl-dev"
        ssh $DEVICE_IP "sudo dmidecode"
        ssh $DEVICE_IP "sudo HW_API_URL=https://hw.staging.ubuntu.com ./hwctl"
"""

import re
import argparse
import subprocess
import json
import logging
from time import sleep
from pathlib import Path
from typing import Dict, List, Optional


def parse_arguments() -> argparse.Namespace:
    """Parse the command-line options of the script.

    Returns:
        Namespace with ``machines_file``, ``template_file``, ``output_dir``
        (all ``Path``), ``poll_interval`` (``int``) and the mutually
        exclusive flags ``send_jobs`` / ``check_status``.
    """
    parser = argparse.ArgumentParser(
        description="Submit jobs and monitor their status on Testflinger."
    )
    parser.add_argument(
        "--machines-file",
        type=Path,
        default="machines.txt",
        help="Path to the file with machines Canonical IDs",
    )
    parser.add_argument(
        "--template-file",
        type=Path,
        default="tf-job.yaml",
        help="Path to Testflinger job template file",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default="job_outputs",
        help="Path to job outputs directory",
    )
    parser.add_argument(
        "--poll-interval",
        type=int,
        default=30,
        help="Time delay between status checks in seconds",
    )
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "--send-jobs",
        action="store_true",
        help="Only submit jobs without monitoring their statuses",
    )
    group.add_argument(
        "--check-status",
        action="store_true",
        help="Only check job statuses without submitting new jobs",
    )
    return parser.parse_args()


# Job Submission Functions
def load_canonical_ids(filename: Path) -> List[str]:
    """Read the Canonical IDs from a file, one ID per line.

    Surrounding whitespace is stripped and blank lines are ignored, so a
    stray empty line cannot turn into a job for an empty queue name.
    """
    with open(filename, "r", encoding="utf8") as file:
        return [line.strip() for line in file if line.strip()]


def create_job_yaml(template_file: Path, canonical_id: str) -> str:
    """Return the template text with ``$CANONICAL_ID`` replaced by the ID."""
    template = Path(template_file).read_text(encoding="utf8")
    return template.replace("$CANONICAL_ID", canonical_id)


def write_temp_job_file(job_yaml: str, output_dir: Path, canonical_id: str) -> Path:
    """Write the rendered job YAML to ``<output_dir>/<id>_tf-job.yaml``.

    Returns:
        Path of the written file; the caller is responsible for removing it.
    """
    temp_job_file = output_dir / f"{canonical_id}_tf-job.yaml"
    temp_job_file.write_text(job_yaml, encoding="utf8")
    return temp_job_file


def submit_job(temp_job_file: Path, canonical_id: str) -> Optional[str]:
    """Submit the job file with ``testflinger submit``.

    Returns:
        The job UUID parsed from the ``job_id:`` line of the command output,
        or ``None`` when the command fails or prints no job ID.
    """
    try:
        result = subprocess.run(
            [
                "testflinger",
                "submit",
                "--attachments-relative-to",
                ".",
                str(temp_job_file),
            ],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        logging.error("Error submitting job for %s: %s", canonical_id, e.stderr)
        return None

    for line in result.stdout.splitlines():
        if line.startswith("job_id:"):
            # partition() tolerates "job_id:<uuid>" without a space, where
            # split(": ")[1] would raise IndexError.
            job_uuid = line.partition(":")[2].strip()
            if job_uuid:
                return job_uuid
    logging.warning("Failed to retrieve job_id for %s", canonical_id)
    return None


def save_job_uuid(job_uuid: str, output_dir: Path, canonical_id: str):
    """Create ``<output_dir>/<canonical_id>/`` and store the UUID in it."""
    id_dir = output_dir / canonical_id
    id_dir.mkdir(parents=True, exist_ok=True)
    (id_dir / "tf_job_id.txt").write_text(job_uuid, encoding="utf8")
    logging.info("Job submitted for %s with job_id: %s", canonical_id, job_uuid)


def submit_all_jobs(
    machines_file: Path, template_file: Path, output_dir: Path
) -> Dict[str, Path]:
    """Submit one job per Canonical ID listed in ``machines_file``.

    Returns:
        Mapping of job UUID to the per-machine output directory, containing
        only the jobs that were submitted successfully.
    """
    job_ids: Dict[str, Path] = {}

    for canonical_id in load_canonical_ids(machines_file):
        job_yaml = create_job_yaml(template_file, canonical_id)
        temp_job_file = write_temp_job_file(job_yaml, output_dir, canonical_id)
        try:
            job_uuid = submit_job(temp_job_file, canonical_id)
            if job_uuid:
                save_job_uuid(job_uuid, output_dir, canonical_id)
                job_ids[job_uuid] = output_dir / canonical_id
        finally:
            # Remove the temporary YAML even if submission raised (e.g. the
            # testflinger executable is missing).
            temp_job_file.unlink()

    return job_ids


# Job Monitoring Functions
def load_jobs(output_dir: Path) -> Dict[str, Path]:
    """Collect job UUIDs from ``<output_dir>/*/tf_job_id.txt``.

    Returns:
        Mapping of job UUID to the directory the UUID file was found in.
        Directories without a UUID file, or with an empty one, are skipped.
    """
    jobs: Dict[str, Path] = {}
    for id_dir in output_dir.iterdir():
        if not id_dir.is_dir():
            continue
        job_id_file = id_dir / "tf_job_id.txt"
        if not job_id_file.exists():
            continue
        job_id = job_id_file.read_text(encoding="utf8").strip()
        if not job_id:
            logging.warning("Ignoring empty job ID file %s", job_id_file)
            continue
        jobs[job_id] = id_dir
    return jobs


def check_job_status(job_id: str) -> Optional[str]:
    """Return the stripped output of ``testflinger status <job_id>``.

    Returns ``None`` (after logging the error) when the command fails.
    """
    try:
        result = subprocess.run(
            ["testflinger", "status", job_id],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        logging.error("Error checking status for job %s: %s", job_id, e.stderr)
        return None
    return result.stdout.strip()


def extract_status_from_output(test_output: str) -> str:
    """Return the first ``"status": "<value>"`` value found in the output.

    The output is searched with a regular expression rather than parsed as
    JSON; ``"Unknown"`` is returned when no such field is present.
    """
    match_ = re.search(r'"status":\s*"([^"]+)"', test_output)
    return match_.group(1) if match_ else "Unknown"


def retrieve_job_results(job_id: str, id_dir: Path):
    """Fetch the results of a job and save them into ``id_dir``.

    Writes the ``test_output`` field to ``output.txt`` and the status
    extracted from it to ``hw_status.txt``.  Command or JSON decoding errors
    are logged and nothing is written.
    """
    try:
        # NOTE(review): every other call uses the `testflinger` command name;
        # presumably both names resolve to the same snap app -- confirm.
        results_result = subprocess.run(
            ["testflinger-cli", "results", job_id],
            capture_output=True,
            text=True,
            check=True,
        )
        results_data = json.loads(results_result.stdout)
    except (subprocess.CalledProcessError, json.JSONDecodeError) as e:
        logging.error("Error fetching results for job %s: %s", job_id, str(e))
        return

    # A missing or null "test_output" (or a non-object payload) is treated as
    # empty output instead of crashing in write_text().
    test_output = ""
    if isinstance(results_data, dict):
        test_output = results_data.get("test_output") or ""

    (id_dir / "output.txt").write_text(test_output, encoding="utf8")
    status = extract_status_from_output(test_output)
    (id_dir / "hw_status.txt").write_text(status, encoding="utf8")
    logging.info("Results and status saved for job %s in %s", job_id, id_dir)


def monitor_jobs(remaining_jobs: Dict[str, Path], poll_interval: int):
    """Poll the jobs until each one is complete or cancelled.

    Args:
        remaining_jobs: Mapping of job UUID to output directory.  Entries are
            removed from this dict in place as jobs finish.
        poll_interval: Seconds to sleep between polling rounds.

    A job whose status cannot be determined stays in the mapping and is
    checked again in the next round.
    """
    while remaining_jobs:
        # Iterate over a snapshot because entries are deleted while looping.
        for job_id, id_dir in list(remaining_jobs.items()):
            job_status = check_job_status(job_id)
            logging.info(
                "Status for job %s (Canonical ID: %s): %s",
                job_id,
                id_dir.name,
                job_status,
            )
            if job_status == "cancelled":
                logging.error("The job %s got cancelled.", job_id)
                del remaining_jobs[job_id]
            elif job_status == "complete":
                retrieve_job_results(job_id, id_dir)
                del remaining_jobs[job_id]

        if remaining_jobs:
            logging.info("Waiting %d seconds before checking again...", poll_interval)
            sleep(poll_interval)

    logging.info("All jobs complete and results retrieved.")


def main():
    """Entry point: submit jobs, monitor them, or both, per the CLI flags."""
    logging.basicConfig(level=logging.INFO, format="%(message)s")
    args = parse_arguments()
    # parents=True so that a nested --output-dir can be created in one go.
    args.output_dir.mkdir(parents=True, exist_ok=True)

    if args.send_jobs:
        submit_all_jobs(args.machines_file, args.template_file, args.output_dir)
    elif args.check_status:
        remaining_jobs = load_jobs(args.output_dir)
        monitor_jobs(remaining_jobs, args.poll_interval)
    else:
        job_ids = submit_all_jobs(
            args.machines_file, args.template_file, args.output_dir
        )
        monitor_jobs(job_ids, args.poll_interval)


if __name__ == "__main__":
    main()