Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds smoke test workflow and tests #424

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 10 additions & 9 deletions .github/workflows/e2e-nvidia-l4-x1.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@ on:
- release-*
paths:
# note this should match the merging criteria in 'mergify.yml'
- '**.py'
- 'pyproject.toml'
- 'requirements**.txt'
- '.github/workflows/e2e-nvidia-l4-x1.yml' # This workflow
- "**.py"
- "pyproject.toml"
- "requirements**.txt"
- ".github/workflows/e2e-nvidia-l4-x1.yml" # This workflow
- "!tests/**" # we don't need to run e2e if we're just changing the tests.

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
Expand Down Expand Up @@ -72,7 +73,7 @@ jobs:
{"Key": "GitHubRef", "Value": "${{ github.ref }}"},
{"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
]

e2e-medium-test:
needs:
- start-medium-ec2-runner
Expand Down Expand Up @@ -156,7 +157,7 @@ jobs:
. venv/bin/activate
# set preserve to true so we can retain the logs
./scripts/e2e-ci.sh -mp

# HACK(osilkin): The above test runs the medium workflow test which does not actually test the training library.
# Therefore we must disable the upload of the training logs, as they will not exist in the same location.
# we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
Expand Down Expand Up @@ -203,7 +204,7 @@ jobs:
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-medium-ec2-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}

# - name: Download loss data
# id: download-logs
# uses: actions/download-artifact@v4
Expand All @@ -214,12 +215,12 @@ jobs:
# - name: Install dependencies
# run: |
# pip install -r requirements-dev.txt

# - name: Try to upload to s3
# id: upload-s3
# continue-on-error: true
# run: |
# output_file='./test.md'
# output_file='./test.md'
# python scripts/create-loss-graph.py \
# --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
# --output-file "${output_file}" \
Expand Down
143 changes: 143 additions & 0 deletions .github/workflows/smoke-tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
# SPDX-License-Identifier: Apache-2.0

name: 'Run smoke tests via Tox::pytest'
# These tests are long running and require accelerated hardware.
# They help to verify that the library is *functionally* correct, but
# they do not try to verify that the library is *correct*.

on:
  # TEMP - this workflow only runs when it is invoked manually,
  # and it only runs against branches in the repo.
  workflow_dispatch:
    inputs:
      branch:
        type: string
        default: main

permissions:
  contents: read

defaults:
  run:
    shell: bash

env:
  ec2_runner_variant: 'g6e.12xlarge'  # 4x L40s

jobs:
  # Starts the EC2 instance that acts as the self-hosted runner for the smoke
  # tests, and publishes its runner label and instance id as job outputs so the
  # downstream jobs can target it and shut it down.
  start-ec2-runner:
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}

    steps:
      - name: "Harden runner"
        uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.1
        with:
          egress-policy: audit

      - name: "Configure AWS credentials"
        uses: "aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502" # v4.0.2
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: "Start EC2 runner"
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@28fbe1c4d7d9ba74134ca5ebc559d5b0a989a856 # v2.3.8
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ${{ vars.AWS_EC2_AMI }}
          ec2-instance-type: ${{ env.ec2_runner_variant }}
          subnet-id: subnet-024298cefa3bedd61
          security-group-id: sg-06300447c4a5fbef3
          iam-role-name: instructlab-ci-runner
          # The action parses this value as JSON, so the last element must
          # NOT be followed by a trailing comma.
          aws-resource-tags: >
            [
              {"Key": "Name", "Value": "instructlab-ci-github-smoketest-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
              {"Key": "GitHubRef", "Value": "${{ github.ref }}"}
            ]

  # Runs the smoke-test suite on the EC2 runner started above.
  run-smoke-tests:
    needs:
      - start-ec2-runner
    runs-on: ${{ needs.start-ec2-runner.outputs.label }}
    # It is important that this job has no write permissions and has
    # no access to any secrets. This is the part of the workflow that
    # executes the checked-out code, which may be untrusted (e.g. from PRs).
    permissions: {}
    steps:
      - name: "Harden runner"
        uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.1
        with:
          egress-policy: audit

      - name: "Install packages"
        run: |
          cat /etc/os-release
          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

      # NOTE(review): each `run` step gets its own shell, so these exports are
      # only visible to the `nvidia-smi` call below and not to later steps.
      # Confirm that the tox environment does not rely on them.
      - name: "Verify cuda environment is setup"
        run: |
          export CUDA_HOME="/usr/local/cuda"
          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
          export PATH="$PATH:$CUDA_HOME/bin"
          nvidia-smi

      # Checks out the branch selected through the `branch` workflow input.
      - name: "Checkout code"
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
          ref: ${{ inputs.branch }}

      # installs in $GITHUB_WORKSPACE/venv.
      # only has to install Tox because Tox will do the other virtual environment management.
      - name: "Setup Python virtual environment"
        run: |
          python3.11 -m venv --upgrade-deps venv
          . venv/bin/activate
          pip install tox

      - name: "Show disk utilization BEFORE tests"
        run: |
          df -h

      # Runs the `py3-smoke` tox environment (the smoke tests, not the unit tests).
      - name: "Run smoke tests with Tox and Pytest"
        run: |
          source venv/bin/activate
          tox -e py3-smoke

      - name: "Show disk utilization AFTER tests"
        run: |
          df -h

  # Shuts the EC2 runner down again once the smoke tests are done.
  stop-ec2-runner:
    needs:
      - start-ec2-runner
      - run-smoke-tests
    runs-on: ubuntu-latest
    # always() makes this job run even when an earlier job failed or was
    # cancelled, so the runner is not left behind.
    if: ${{ always() }}
    steps:
      - name: "Harden runner"
        uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.1
        with:
          egress-policy: audit

      - name: "Configure AWS credentials"
        uses: "aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502" # v4.0.2
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: "Stop EC2 runner"
        uses: machulav/ec2-github-runner@28fbe1c4d7d9ba74134ca5ebc559d5b0a989a856 # v2.3.8
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-ec2-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-ec2-runner.outputs.ec2-instance-id }}
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,8 @@ exclude = [
]
# honor excludes by not following there through imports
follow_imports = "silent"

[tool.pytest.ini_options]
# Custom pytest markers used by the test suite are declared here.
# `slow` tags long-running tests so that they can be left out of a run
# with `-m "not slow"`.
markers = [
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
]
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ ipython
ipykernel
jupyter

huggingface_hub
Loading