Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

e2e: replace old small job with new medium job (backport #277) #283

Merged
merged 1 commit into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/mergify.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,18 @@ pull_request_rules:
# e2e workflow
- or:
- and:
# note this should match the triggering criteria in 'e2e-nvidia-t4-x1.yml'
- check-success=e2e-workflow-complete
# note this should match the triggering criteria in 'e2e-nvidia-a10g-x1.yml'
- check-success=e2e-medium-workflow-complete
- or:
- files~=\.py$
- files=pyproject.toml
- files~=^requirements.*\.txt$
- files=.github/workflows/e2e-nvidia-t4-x1.yml
- files=.github/workflows/e2e-nvidia-a10g-x1.yml
- and:
- -files~=\.py$
- -files=pyproject.toml
- -files~=^requirements.*\.txt$
- -files=.github/workflows/e2e-nvidia-t4-x1.yml
- -files=.github/workflows/e2e-nvidia-a10g-x1.yml

# code lint workflow
- or:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
# SPDX-License-Identifier: Apache-2.0

name: E2E (NVIDIA Tesla T4 x1)
name: E2E (NVIDIA A10G x1)

on:
# run against every merge commit to 'main' and release branches
push:
branches:
- main
- release-*
# only run on PRs that touch certain regex paths
pull_request_target:
types:
- opened
- synchronize
- reopened
branches:
- main
- release-*
Expand All @@ -20,15 +18,24 @@ on:
- '**.py'
- 'pyproject.toml'
- 'requirements**.txt'
- '.github/workflows/e2e-nvidia-t4-x1.yml' # Follow-on workflow
- '.github/workflows/e2e-nvidia-a10g-x1.yml' # This workflow

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

env:
LC_ALL: en_US.UTF-8

defaults:
run:
shell: bash

permissions:
contents: read

jobs:
start-runner:
name: Start external EC2 runner
start-medium-ec2-runner:
runs-on: ubuntu-latest
outputs:
label: ${{ steps.start-ec2-runner.outputs.label }}
Expand All @@ -40,6 +47,7 @@ jobs:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}

- name: Start EC2 runner
id: start-ec2-runner
uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
Expand All @@ -59,23 +67,21 @@ jobs:
{"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
]

e2e:
name: E2E Test
needs: start-runner
runs-on: ${{ needs.start-runner.outputs.label }}
e2e-medium-test:
needs:
- start-medium-ec2-runner
runs-on: ${{ needs.start-medium-ec2-runner.outputs.label }}

# It is important that this job has no write permissions and has
# no access to any secrets. This job (e2e-medium-test) is where we
# are running untrusted code from PRs.
permissions: {}

steps:
# for debugging
- name: Print environment state
- name: Install Packages
run: |
echo "Current Working Directory: $PWD"
echo "Files in Local Directory:"
ls -l
cat /etc/os-release
sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

- name: Checkout instructlab/instructlab
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
Expand All @@ -93,30 +99,19 @@ jobs:
# https://github.com/actions/checkout/issues/249
fetch-depth: 0

# for debugging
- name: Print environment state
run: |
echo "Current Working Directory: $PWD"
echo "Files in Local Directory:"
ls -l

- name: Fetch and checkout PR
id: fetch_pr
if: github.event_name == 'pull_request_target'
if: ${{ github.event_name == 'pull_request_target' }}
working-directory: ./training
run: |
git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-${{ github.event.pull_request.number }}
git checkout pr-${{ github.event.pull_request.number }}

- name: Install system packages
run: |
cat /etc/os-release
sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

- name: Install instructlab
- name: Install ilab
working-directory: ./instructlab
run: |
export PATH="/home/ec2-user/.local/bin:/usr/local/cuda/bin:$PATH"
export CUDA_HOME="/usr/local/cuda"
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
export PATH="$PATH:$CUDA_HOME/bin"
python3.11 -m venv --upgrade-deps venv
. venv/bin/activate
nvidia-smi
Expand All @@ -127,7 +122,7 @@ jobs:
# https://github.com/instructlab/instructlab/issues/1821
# install with Torch and build dependencies installed
python3.11 -m pip install packaging wheel setuptools-scm
python3.11 -m pip install .[cuda]
python3.11 -m pip install .[cuda] -r requirements-vllm-cuda.txt

- name: Update instructlab-training library
working-directory: ./training
Expand All @@ -136,17 +131,21 @@ jobs:
pip install .
pip install .[cuda]

- name: Check disk
run: |
df -h

- name: Run e2e test
working-directory: ./instructlab
run: |
. venv/bin/activate
./scripts/basic-workflow-tests.sh -a
./scripts/e2e-ci.sh -m

stop-runner:
stop-medium-ec2-runner:
name: Stop external EC2 runner
needs:
- start-runner
- e2e
- start-medium-ec2-runner
- e2e-medium-test
runs-on: ubuntu-latest
if: ${{ always() }}
steps:
Expand All @@ -161,13 +160,13 @@ jobs:
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
label: ${{ needs.start-medium-ec2-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}

e2e-workflow-complete:
e2e-medium-workflow-complete:
# we don't want to block PRs on failed EC2 cleanup,
# so "stop-medium-ec2-runner" is intentionally not listed in "needs"
needs: ["start-runner", "e2e"]
needs: ["start-medium-ec2-runner", "e2e-medium-test"]
runs-on: ubuntu-latest
steps:
- name: E2E Workflow Complete
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# InstructLab Training Library

![Lint](https://github.com/instructlab/training/actions/workflows/lint.yml/badge.svg?branch=main)
![`e2e-nvidia-a10g-x1.yml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-a10g-x1.yml/badge.svg?branch=main)
![Build](https://github.com/instructlab/training/actions/workflows/pypi.yaml/badge.svg?branch=main)
![Release](https://img.shields.io/github/v/release/instructlab/training)
![License](https://img.shields.io/github/license/instructlab/training)
Expand Down
Loading