feat: add log exporting to e2e tests
Currently, the training library runs through a series of end-to-end tests which ensure there are
no bugs in the code being tested. However, we do not perform any form of validation to ensure that
the training logic and quality have not regressed.

This presents an issue: a run can be "correct" in the sense that no hard errors are hit, while
invisible bugs are introduced that degrade training quality or otherwise compromise the resulting
models.

This commit addresses that problem by introducing the ability to export the training loss data
itself from the test and to render the loss curve using matplotlib.
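
For illustration, here is a minimal sketch of the rendering step: it reads the exported JSONL log and plots the loss with matplotlib. The record field names ("step", "total_loss") and the output filename are assumptions made for this example, not the exact schema the training library emits.

```python
# Minimal sketch: plot a loss curve from the exported JSONL metrics log.
# The "step"/"total_loss" field names are assumed; the real schema may differ.
import json

import matplotlib.pyplot as plt


def plot_loss(log_path: str, output_path: str) -> None:
    steps, losses = [], []
    with open(log_path, encoding="utf-8") as f:
        for i, line in enumerate(f):
            record = json.loads(line)
            if "total_loss" in record:  # skip non-metric records such as run params
                steps.append(record.get("step", i))
                losses.append(record["total_loss"])

    plt.figure(figsize=(8, 4))
    plt.plot(steps, losses)
    plt.xlabel("step")
    plt.ylabel("loss")
    plt.title("Training loss")
    plt.savefig(output_path, bbox_inches="tight")


if __name__ == "__main__":
    plot_loss("training-log.jsonl", "loss-curve.png")
```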

Signed-off-by: Oleg S <[email protected]>
RobotSail committed Nov 13, 2024
1 parent 466474a commit 039b743
Showing 4 changed files with 324 additions and 2 deletions.
69 changes: 68 additions & 1 deletion .github/workflows/e2e-nvidia-l4-x1.yml
@@ -41,6 +41,12 @@ jobs:
label: ${{ steps.start-ec2-runner.outputs.label }}
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
steps:
- name: "Harden Runner"
# v2.10.1
uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7
with:
egress-policy: audit

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
with:
@@ -78,6 +84,12 @@ jobs:
permissions: {}

steps:
- name: "Harden Runner"
# v2.10.1
uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7
with:
egress-policy: audit

- name: Install Packages
run: |
cat /etc/os-release
@@ -141,7 +153,22 @@ jobs:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
. venv/bin/activate
./scripts/e2e-ci.sh -m
# set preserve to true so we can retain the logs
./scripts/e2e-ci.sh -mp
# In Python, the log file is named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl"
# and is written into a directory created by `mktemp -d`, so we can locate it with:
log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
mv "${log_file}" training-log.jsonl
- name: Upload training logs
uses: actions/upload-artifact@v4
with:
name: training-log.jsonl
path: ./instructlab/training-log.jsonl
retention-days: 1
overwrite: true

stop-medium-ec2-runner:
needs:
@@ -150,6 +177,12 @@ jobs:
runs-on: ubuntu-latest
if: ${{ always() }}
steps:
- name: "Harden Runner"
# v2.10.1
uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7
with:
egress-policy: audit

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
with:
@@ -164,6 +197,40 @@ jobs:
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-medium-ec2-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}

- name: Download loss data
id: download-logs
uses: actions/download-artifact@v4
with:
name: training-log.jsonl
path: downloaded-data

- name: Install dependencies
run: |
pip install -r requirements-dev.txt
- name: Try to upload to s3
id: upload-s3
continue-on-error: true
run: |
output_file='./test.md'
python scripts/create-loss-graph.py \
--log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
--output-file "${output_file}" \
--aws-region "${{ vars.AWS_REGION }}" \
--bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
--base-branch "${{ github.event.pull_request.base.ref }}" \
--pr-number "${{ github.event.pull_request.number }}" \
--head-sha "${{ github.event.pull_request.head.sha }}" \
--origin-repository "${{ github.repository }}"
cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
- name: Check S3 upload status
if: steps.upload-s3.outcome == 'failure'
run: |
echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
e2e-medium-workflow-complete:
# we don't want to block PRs on failed EC2 cleanup
69 changes: 68 additions & 1 deletion .github/workflows/e2e-nvidia-l40s-x4.yml
@@ -19,6 +19,12 @@ jobs:
label: ${{ steps.start-ec2-runner.outputs.label }}
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
steps:
- name: "Harden Runner"
# v2.10.1
uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7
with:
egress-policy: audit

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
with:
@@ -54,6 +60,11 @@ jobs:
pull-requests: write

steps:
- name: "Harden Runner"
# v2.10.1
uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7
with:
egress-policy: audit
- name: Install Packages
run: |
cat /etc/os-release
@@ -170,7 +181,23 @@ jobs:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
. venv/bin/activate
./scripts/e2e-ci.sh -l
# set preserve to true so we can retain the logs
./scripts/e2e-ci.sh -lp
# In Python, the log file is named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl"
# and is written into a directory created by `mktemp -d`, so we can locate it with:
log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
mv "${log_file}" training-log.jsonl
- name: Upload training logs
uses: actions/upload-artifact@v4
with:
name: training-log.jsonl
path: ./instructlab/training-log.jsonl
retention-days: 1
overwrite: true

- name: Add comment to PR if the workflow failed
if: failure() && steps.check_pr.outputs.is_pr == 'true'
@@ -221,6 +248,12 @@ jobs:
runs-on: ubuntu-latest
if: ${{ always() }}
steps:
- name: "Harden Runner"
# v2.10.1
uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7
with:
egress-policy: audit

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
with:
@@ -235,3 +268,37 @@ jobs:
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-large-ec2-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}

- name: Download loss data
id: download-logs
uses: actions/download-artifact@v4
with:
name: training-log.jsonl
path: downloaded-data

- name: Install dependencies
run: |
pip install -r requirements-dev.txt
- name: Try to upload to s3
id: upload-s3
continue-on-error: true
run: |
output_file='./test.md'
python scripts/create-loss-graph.py \
--log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
--output-file "${output_file}" \
--aws-region "${{ vars.AWS_REGION }}" \
--bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
--base-branch "${{ github.event.pull_request.base.ref }}" \
--pr-number "${{ github.event.pull_request.number }}" \
--head-sha "${{ github.event.pull_request.head.sha }}" \
--origin-repository "${{ github.repository }}"
- name: Check S3 upload status
if: steps.upload-s3.outcome == 'failure'
run: |
echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
2 changes: 2 additions & 0 deletions requirements-dev.txt
@@ -2,6 +2,8 @@

-r requirements.txt

matplotlib
numpy
pre-commit>=3.0.4,<5.0
pylint>=2.16.2,<4.0
pylint-pydantic
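The fourth changed file in this commit did not render in this view; based on the invocation in the workflows above, it is presumably scripts/create-loss-graph.py. The sketch below only illustrates the command-line shape that the workflow steps rely on: the argument names mirror the workflow invocation, while the plotting details, the boto3 dependency, the S3 key layout, and the Markdown output are assumptions rather than the actual implementation.

```python
# Illustrative sketch of a create-loss-graph-style CLI; not the actual script.
import argparse
import json

import boto3  # assumed dependency for the S3 upload
import matplotlib.pyplot as plt


def main() -> None:
    parser = argparse.ArgumentParser(description="Render a loss graph and upload it to S3.")
    for flag in (
        "--log-file", "--output-file", "--aws-region", "--bucket-name",
        "--base-branch", "--pr-number", "--head-sha", "--origin-repository",
    ):
        parser.add_argument(flag, required=True)
    args = parser.parse_args()

    # Plot the loss curve from the JSONL log (field names are assumed).
    losses = []
    with open(args.log_file, encoding="utf-8") as f:
        for line in f:
            record = json.loads(line)
            if "total_loss" in record:
                losses.append(record["total_loss"])
    plt.plot(losses)
    plt.xlabel("step")
    plt.ylabel("loss")
    image_path = "loss-curve.png"
    plt.savefig(image_path, bbox_inches="tight")

    # Upload the rendered image and build its URL (key layout is an assumption).
    key = f"{args.origin_repository}/{args.pr_number}/{args.head_sha}/loss-curve.png"
    s3 = boto3.client("s3", region_name=args.aws_region)
    s3.upload_file(image_path, args.bucket_name, key)
    url = f"https://{args.bucket_name}.s3.{args.aws_region}.amazonaws.com/{key}"

    # Emit Markdown that the workflow appends to the GitHub step summary.
    with open(args.output_file, "w", encoding="utf-8") as out:
        out.write(f"### Loss curve for {args.head_sha} (base: {args.base_branch})\n\n")
        out.write(f"![Training loss]({url})\n")


if __name__ == "__main__":
    main()
```

In the workflows, the upload step then appends the generated Markdown file to ${GITHUB_STEP_SUMMARY}, so the loss graph appears directly in the run summary.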
