diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml
index 5453ff3a..b4755151 100644
--- a/.github/workflows/e2e-nvidia-l4-x1.yml
+++ b/.github/workflows/e2e-nvidia-l4-x1.yml
@@ -41,6 +41,12 @@ jobs:
       label: ${{ steps.start-ec2-runner.outputs.label }}
       ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
     steps:
+      - name: "Harden Runner"
+        # v2.10.1
+        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7
+        with:
+          egress-policy: audit
+
       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
         with:
@@ -78,6 +84,12 @@ jobs:
     permissions: {}
 
     steps:
+      - name: "Harden Runner"
+        # v2.10.1
+        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7
+        with:
+          egress-policy: audit
+
       - name: Install Packages
         run: |
           cat /etc/os-release
@@ -141,7 +153,22 @@ jobs:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
           . venv/bin/activate
-          ./scripts/e2e-ci.sh -m
+          # set preserve to true so we can retain the logs
+          ./scripts/e2e-ci.sh -mp
+
+          # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
+          # and we know that it will be written into a directory created by `mktemp -d`.
+          # Given this information, we can use the following command to find the file:
+          log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
+          mv "${log_file}" training-log.jsonl
+
+      - name: Upload training logs
+        uses: actions/upload-artifact@v4
+        with:
+          name: training-log.jsonl
+          path: ./instructlab/training-log.jsonl
+          retention-days: 1
+          overwrite: true
 
   stop-medium-ec2-runner:
     needs:
@@ -150,6 +177,12 @@ jobs:
     runs-on: ubuntu-latest
     if: ${{ always() }}
     steps:
+      - name: "Harden Runner"
+        # v2.10.1
+        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7
+        with:
+          egress-policy: audit
+
       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
         with:
@@ -164,6 +197,40 @@ jobs:
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
           label: ${{ needs.start-medium-ec2-runner.outputs.label }}
           ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}
+
+      - name: Download loss data
+        id: download-logs
+        uses: actions/download-artifact@v4
+        with:
+          name: training-log.jsonl
+          path: downloaded-data
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements-dev.txt
+
+      - name: Try to upload to s3
+        id: upload-s3
+        continue-on-error: true
+        run: |
+          output_file='./test.md'
+          python scripts/create-loss-graph.py \
+            --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
+            --output-file "${output_file}" \
+            --aws-region "${{ vars.AWS_REGION }}" \
+            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
+            --base-branch "${{ github.event.pull_request.base.ref }}" \
+            --pr-number "${{ github.event.pull_request.number }}" \
+            --head-sha "${{ github.event.pull_request.head.sha }}" \
+            --origin-repository "${{ github.repository }}"
+
+          cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
+
+      - name: Check S3 upload status
+        if: steps.upload-s3.outcome == 'failure'
+        run: |
+          echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
+          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
 
   e2e-medium-workflow-complete:
     # we don't want to block PRs on failed EC2 cleanup
diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml
index 4620d327..0ae7212e 100644
--- a/.github/workflows/e2e-nvidia-l40s-x4.yml
+++ b/.github/workflows/e2e-nvidia-l40s-x4.yml
@@ -19,6 +19,12 @@ jobs:
       label: ${{ steps.start-ec2-runner.outputs.label }}
       ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
     steps:
+      - name: "Harden Runner"
+        # v2.10.1
+        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7
+        with:
+          egress-policy: audit
+
       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
         with:
@@ -54,6 +60,11 @@ jobs:
       pull-requests: write
 
     steps:
+      - name: "Harden Runner"
+        # v2.10.1
+        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7
+        with:
+          egress-policy: audit
       - name: Install Packages
         run: |
           cat /etc/os-release
@@ -170,7 +181,23 @@ jobs:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
           . venv/bin/activate
-          ./scripts/e2e-ci.sh -l
+
+          # set preserve to true so we can retain the logs
+          ./scripts/e2e-ci.sh -lp
+
+          # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
+          # and we know that it will be written into a directory created by `mktemp -d`.
+          # Given this information, we can use the following command to find the file:
+          log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
+          mv "${log_file}" training-log.jsonl
+
+      - name: Upload training logs
+        uses: actions/upload-artifact@v4
+        with:
+          name: training-log.jsonl
+          path: ./instructlab/training-log.jsonl
+          retention-days: 1
+          overwrite: true
 
       - name: Add comment to PR if the workflow failed
         if: failure() && steps.check_pr.outputs.is_pr == 'true'
@@ -221,6 +248,12 @@ jobs:
     runs-on: ubuntu-latest
     if: ${{ always() }}
     steps:
+      - name: "Harden Runner"
+        # v2.10.1
+        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7
+        with:
+          egress-policy: audit
+
       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
         with:
@@ -235,3 +268,37 @@ jobs:
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
           label: ${{ needs.start-large-ec2-runner.outputs.label }}
           ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
+
+      - name: Download loss data
+        id: download-logs
+        uses: actions/download-artifact@v4
+        with:
+          name: training-log.jsonl
+          path: downloaded-data
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements-dev.txt
+
+      - name: Try to upload to s3
+        id: upload-s3
+        continue-on-error: true
+        run: |
+          output_file='./test.md'
+          python scripts/create-loss-graph.py \
+            --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
+            --output-file "${output_file}" \
+            --aws-region "${{ vars.AWS_REGION }}" \
+            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
+            --base-branch "${{ github.event.pull_request.base.ref }}" \
+            --pr-number "${{ github.event.pull_request.number }}" \
+            --head-sha "${{ github.event.pull_request.head.sha }}" \
+            --origin-repository "${{ github.repository }}"
+
+          cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
+
+      - name: Check S3 upload status
+        if: steps.upload-s3.outcome == 'failure'
+        run: |
+          echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
+          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
diff --git a/requirements-dev.txt b/requirements-dev.txt
index a0dff1ed..f77c807f 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -2,6 +2,8 @@
 
 -r requirements.txt
 
+matplotlib
+numpy
 pre-commit>=3.0.4,<5.0
 pylint>=2.16.2,<4.0
 pylint-pydantic
diff --git a/scripts/create-loss-graph.py b/scripts/create-loss-graph.py
new file mode 100644
index 00000000..e4ab1254
--- /dev/null
+++ b/scripts/create-loss-graph.py
@@ -0,0 +1,226 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Plot training loss, upload the graph to S3 and write a markdown summary."""
+
+# Standard
+from argparse import ArgumentParser
+from pathlib import Path
+from subprocess import run
+from typing import Dict, List
+import json
+
+# Third Party
+from matplotlib import pyplot as plt
+from pydantic import BaseModel
+
+
+class Arguments(BaseModel):
+    """Command-line arguments of this script, validated by pydantic."""
+
+    log_file: str | None = None
+    output_file: str
+    aws_region: str
+    bucket_name: str
+    base_branch: str
+    pr_number: str
+    head_sha: str
+    origin_repository: str
+
+
+def render_image(loss_data: List[float], outfile: Path) -> None:
+    """Plot ``loss_data`` by step and save it to ``outfile`` as a PNG."""
+    # create the plot
+    plt.figure()
+    plt.plot(loss_data)
+    plt.xlabel("Steps")
+    plt.ylabel("Loss")
+    plt.title("Training performance over fixed dataset")
+
+    # remove any previous image so a stale graph is never left behind
+    if outfile.exists():
+        outfile.unlink()
+
+    plt.savefig(outfile, format="png")
+
+
+def contents_from_file(log_file: Path) -> List[Dict]:
+    """Parse a JSON-lines file into one decoded object per non-blank line.
+
+    Raises:
+        FileNotFoundError: if ``log_file`` does not exist.
+        ValueError: if ``log_file`` is a directory.
+    """
+    if not log_file.exists():
+        raise FileNotFoundError(f"Log file {log_file} does not exist")
+    if log_file.is_dir():
+        raise ValueError(f"Log file {log_file} is a directory")
+    with open(log_file, "r", encoding="utf-8") as f:
+        # a blank line is not valid JSON, so skip it instead of crashing
+        return [json.loads(line) for line in f if line.strip()]
+
+
+def read_loss_data(log_file: Path) -> List[float]:
+    """Return the ``total_loss`` of every log entry that has one, in file order.
+
+    Raises:
+        ValueError: if ``log_file`` is falsy, no entry has a ``total_loss``,
+            or a loss value is not a number.
+    """
+    if not log_file:
+        raise ValueError("log_file must be provided when source is file")
+    contents = contents_from_file(log_file)
+
+    # select the loss data
+    loss_data = [item["total_loss"] for item in contents if "total_loss" in item]
+
+    if not loss_data:
+        raise ValueError("Loss data is empty")
+
+    # ensure that the loss data is valid: a whole-number loss decodes as int, so
+    # ints are accepted too; bool is an int subclass and is rejected explicitly
+    if not all(
+        isinstance(loss, (int, float)) and not isinstance(loss, bool)
+        for loss in loss_data
+    ):
+        raise ValueError("Loss data must be a list of floats")
+
+    return [float(loss) for loss in loss_data]
+
+
+def write_to_s3(
+    file: Path,
+    bucket_name: str,
+    destination: str,
+):
+    """Copy ``file`` to ``s3://{bucket_name}/{destination}`` with the AWS CLI.
+
+    Raises:
+        RuntimeError: if ``file`` does not exist or ``aws s3 cp`` fails.
+    """
+    if not file.exists():
+        raise RuntimeError(f"File {file} does not exist")
+
+    s3_path = f"s3://{bucket_name}/{destination}"
+    # check=False: with check=True a failure raises CalledProcessError inside
+    # run(), so the return-code check below (and its stderr report) never ran
+    results = run(
+        ["aws", "s3", "cp", str(file), s3_path], capture_output=True, check=False
+    )
+    if results.returncode != 0:
+        raise RuntimeError(f"failed to upload to s3: {results.stderr.decode('utf-8')}")
+    print(results.stdout.decode("utf-8"))
+
+
+def get_destination_path(base_ref: str, pr_number: str, head_sha: str):
+    """Build the object key under which the graph for this PR commit is stored."""
+    return f"pulls/{base_ref}/{pr_number}/{head_sha}/loss-graph.png"
+
+
+def write_md_file(
+    output_file: Path, url: str, pr_number: str, head_sha: str, origin_repository: str
+):
+    """Write markdown to ``output_file`` that embeds the loss graph at ``url``."""
+    commit_url = f"https://github.com/{origin_repository}/commit/{head_sha}"
+    # use the parameters rather than a module-level ``args`` so the function
+    # also works when it is imported and called directly
+    md_template = f"""
+# Loss Graph for PR {pr_number} ([{head_sha[:7]}]({commit_url}))
+
+![Loss Graph]({url})
+"""
+    output_file.write_text(md_template, encoding="utf-8")
+
+
+def get_url(bucket_name: str, destination: str, aws_region: str) -> str:
+    """Return the HTTPS URL of ``destination`` inside the regional S3 bucket."""
+    return f"https://{bucket_name}.s3.{aws_region}.amazonaws.com/{destination}"
+
+
+def main(args: Arguments):
+    """Render the loss graph, upload it to S3 and write the markdown summary."""
+    # Arguments.log_file is optional and Path(None) raises an opaque TypeError,
+    # so reject a missing value up front
+    if not args.log_file:
+        raise ValueError("log_file must be provided when source is file")
+
+    # first things first, we create the png file to upload to S3
+    log_file = Path(args.log_file)
+    loss_data = read_loss_data(log_file=log_file)
+    output_image = Path("/tmp/loss-graph.png")
+    output_file = Path(args.output_file)
+    render_image(loss_data=loss_data, outfile=output_image)
+    destination_path = get_destination_path(
+        base_ref=args.base_branch, pr_number=args.pr_number, head_sha=args.head_sha
+    )
+    write_to_s3(
+        file=output_image, bucket_name=args.bucket_name, destination=destination_path
+    )
+    s3_url = get_url(
+        bucket_name=args.bucket_name,
+        destination=destination_path,
+        aws_region=args.aws_region,
+    )
+    write_md_file(
+        output_file=output_file,
+        url=s3_url,
+        pr_number=args.pr_number,
+        head_sha=args.head_sha,
+        origin_repository=args.origin_repository,
+    )
+    print(f"Loss graph uploaded to '{s3_url}'")
+    print(f"Markdown file written to '{output_file}'")
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+
+    parser.add_argument(
+        "--log-file",
+        type=str,
+        required=True,
+        help="The log file to read the loss data from.",
+    )
+    parser.add_argument(
+        "--output-file",
+        type=str,
+        required=True,
+        help="The output file where the resulting markdown will be written.",
+    )
+    parser.add_argument(
+        "--aws-region",
+        type=str,
+        required=True,
+        help="S3 region to which the bucket belongs.",
+    )
+    parser.add_argument(
+        "--bucket-name", type=str, required=True, help="The S3 bucket name"
+    )
+    parser.add_argument(
+        "--base-branch",
+        type=str,
+        required=True,
+        help="The base branch being merged to.",
+    )
+    parser.add_argument("--pr-number", type=str, required=True, help="The PR number")
+    parser.add_argument(
+        "--head-sha", type=str, required=True, help="The head SHA of the PR"
+    )
+    parser.add_argument(
+        "--origin-repository",
+        type=str,
+        required=True,
+        help="The repository to which the originating branch belongs to.",
+    )
+
+    cli_args = parser.parse_args()
+
+    arguments = Arguments(
+        log_file=cli_args.log_file,
+        output_file=cli_args.output_file,
+        aws_region=cli_args.aws_region,
+        bucket_name=cli_args.bucket_name,
+        base_branch=cli_args.base_branch,
+        pr_number=cli_args.pr_number,
+        head_sha=cli_args.head_sha,
+        origin_repository=cli_args.origin_repository,
+    )
+    main(arguments)