instructlab · JamesKunstle · Jan 11, 2025 · Jan 25, 2025
diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml
@@ -15,10 +15,11 @@ on:
       - release-*
     paths:
       # note this should match the merging criteria in 'mergify.yml'
-      - '**.py'
-      - 'pyproject.toml'
-      - 'requirements**.txt'
-      - '.github/workflows/e2e-nvidia-l4-x1.yml' # This workflow
+      - "**.py"
+      - "pyproject.toml"
+      - "requirements**.txt"
+      - ".github/workflows/e2e-nvidia-l4-x1.yml" # This workflow
+      - "!tests/**" # we don't need to run e2e if we're just changing the tests.
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -72,7 +73,7 @@ jobs:
               {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
               {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
             ]
-  
+
   e2e-medium-test:
     needs:
       - start-medium-ec2-runner
@@ -156,7 +157,7 @@ jobs:
           . venv/bin/activate
           # set preserve to true so we can retain the logs
           ./scripts/e2e-ci.sh -mp
-          
+
           # HACK(osilkin): The above test runs the medium workflow test which does not actually test the training library.
           #                Therefore we must disable the upload of the training logs, as they will not exist in the same location.
           # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
@@ -203,7 +204,7 @@ jobs:
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
           label: ${{ needs.start-medium-ec2-runner.outputs.label }}
           ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}
-      
+
       # - name: Download loss data
       #   id: download-logs
       #   uses: actions/download-artifact@v4
@@ -214,12 +215,12 @@ jobs:
       # - name: Install dependencies
       #   run: |
       #     pip install -r requirements-dev.txt
-      
+
       # - name: Try to upload to s3
       #   id: upload-s3
       #   continue-on-error: true
       #   run: |
-      #     output_file='./test.md' 
+      #     output_file='./test.md'
       #     python scripts/create-loss-graph.py  \
       #       --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
       #       --output-file "${output_file}" \

diff --git a/.github/workflows/smoke-tests.yaml b/.github/workflows/smoke-tests.yaml
@@ -0,0 +1,143 @@
+# SPDX-License-Identifier: Apache-2.0
+
+name: "Run smoke tests via Tox::pytest"
+# These tests will be long running and require accelerated hardware.
+# They will help to verify that the library is *functionally* correct but
+# will not try to verify that the library is *correct*.
+
+on:
+  # TEMP - only runs when manually invoked
+  # and only runs against branches in the repo.
+  workflow_dispatch:
+    inputs:
+      branch:
+        type: string
+        default: main
+
+permissions:
+  contents: read
+
+defaults:
+  run:
+    shell: bash
+
+env:
+  ec2_runner_variant: "g6e.12xlarge" # 4x L40s
+
+jobs:
+  start-ec2-runner:
+    runs-on: ubuntu-latest
+    outputs:
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id}}
+
+    steps:
+      - name: "Harden runner"
+        uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.1
+        with:
+          egress-policy: audit
+
+      - name: "Configure AWS credentials"
+        uses: "aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502" # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ vars.AWS_REGION }}
+
+      - name: "Start EC2 runner"
+        id: start-ec2-runner
+        uses: machulav/ec2-github-runner@28fbe1c4d7d9ba74134ca5ebc559d5b0a989a856 # v2.3.8
+        with:
+          mode: start
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          ec2-image-id: ${{ vars.AWS_EC2_AMI }}
+          ec2-instance-type: ${{ env.ec2_runner_variant }}
+          subnet-id: subnet-024298cefa3bedd61
+          security-group-id: sg-06300447c4a5fbef3
+          iam-role-name: instructlab-ci-runner
+          aws-resource-tags: > # stringified JSON array: no trailing comma after the last tag
+            [
+            {"Key": "Name", "Value": "instructlab-ci-github-smoketest-runner"},
+            {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
+            {"Key": "GitHubRef", "Value": "${{ github.ref }}"}
+            ]
+
+  run-smoke-tests:
+    needs:
+      - start-ec2-runner
+    runs-on: ${{needs.start-ec2-runner.outputs.label}}
+    # It is important that this job has no write permissions and has
+    # no access to any secrets. This part is where we are running
+    # untrusted code from PRs.
+    permissions: {}
+    steps:
+      - name: "Harden runner"
+        uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.1
+        with:
+          egress-policy: audit
+
+      - name: "Install packages"
+        run: |
+          cat /etc/os-release
+          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
+
+      - name: "Verify cuda environment is setup"
+        run: |
+          export CUDA_HOME="/usr/local/cuda"
+          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
+          export PATH="$PATH:$CUDA_HOME/bin"
+          nvidia-smi
+
+      - name: "Checkout code"
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+          ref: ${{inputs.branch}}
+
+      # installs in $GITHUB_WORKSPACE/venv.
+      # only has to install Tox because Tox will do the other virtual environment management.
+      - name: "Setup Python virtual environment"
+        run: |
+          python3.11 -m venv --upgrade-deps venv
+          . venv/bin/activate
+          pip install tox
+
+      - name: "Show disk utilization BEFORE tests"
+        run: |
+          df -h
+
+      - name: "Run smoke tests with Tox and Pytest"
+        run: |
+          source venv/bin/activate
+          tox -e py3-smoke
+
+      - name: "Show disk utilization AFTER tests"
+        run: |
+          df -h
+
+  stop-ec2-runner:
+    needs:
+      - start-ec2-runner
+      - run-smoke-tests
+    runs-on: ubuntu-latest
+    if: ${{ always() }} # run regardless of how the needed jobs finished, so the runner is still stopped
+    steps:
+      - name: "Harden runner"
+        uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.1
+        with:
+          egress-policy: audit
+
+      - name: "Configure AWS credentials"
+        uses: "aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502" # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ vars.AWS_REGION }}
+
+      - name: "Stop EC2 runner"
+        uses: machulav/ec2-github-runner@28fbe1c4d7d9ba74134ca5ebc559d5b0a989a856 # v2.3.8
+        with:
+          mode: stop
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          label: ${{ needs.start-ec2-runner.outputs.label }} # output of the start-ec2-runner job
+          ec2-instance-id: ${{ needs.start-ec2-runner.outputs.ec2-instance-id }} # output of the start-ec2-runner job
diff --git a/pyproject.toml b/pyproject.toml
@@ -107,3 +107,8 @@ exclude = [
 ]
 # honor excludes by not following there through imports
 follow_imports = "silent"
+
+[tool.pytest.ini_options]
+markers = [
+  "slow: marks tests as slow (deselect with '-m \"not slow\"')",
+]
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -13,3 +13,4 @@ ipython
 ipykernel
 jupyter
 
+huggingface_hub
Original file line number	Diff line number	Diff line change
Expand Up		@@ -13,3 +13,4 @@ ipython
		ipykernel
		jupyter

		huggingface_hub