diff --git a/.github/mergify.yml b/.github/mergify.yml index 774b6dd4..edd79057 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -28,18 +28,18 @@ pull_request_rules: # e2e workflow - or: - and: - # note this should match the triggering criteria in 'e2e-nvidia-t4-x1.yml' - - check-success=e2e-workflow-complete + # note this should match the triggering criteria in 'e2e-nvidia-a10g-x1.yml' + - check-success=e2e-medium-workflow-complete - or: - files~=\.py$ - files=pyproject.toml - files~=^requirements.*\.txt$ - - files=.github/workflows/e2e-nvidia-t4-x1.yml + - files=.github/workflows/e2e-nvidia-a10g-x1.yml - and: - -files~=\.py$ - -files=pyproject.toml - -files~=^requirements.*\.txt$ - - -files=.github/workflows/e2e-nvidia-t4-x1.yml + - -files=.github/workflows/e2e-nvidia-a10g-x1.yml # code lint workflow - or: diff --git a/.github/workflows/e2e-nvidia-t4-x1.yml b/.github/workflows/e2e-nvidia-a10g-x1.yml similarity index 79% rename from .github/workflows/e2e-nvidia-t4-x1.yml rename to .github/workflows/e2e-nvidia-a10g-x1.yml index 3a4607c2..e769c781 100644 --- a/.github/workflows/e2e-nvidia-t4-x1.yml +++ b/.github/workflows/e2e-nvidia-a10g-x1.yml @@ -1,17 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 -name: E2E (NVIDIA Tesla T4 x1) +name: E2E (NVIDIA A10G x1) on: + # run against every merge commit to 'main' and release branches push: branches: - main - release-* + # only run on PRs that touch certain regex paths pull_request_target: - types: - - opened - - synchronize - - reopened branches: - main - release-* @@ -20,15 +18,24 @@ on: - '**.py' - 'pyproject.toml' - 'requirements**.txt' - - '.github/workflows/e2e-nvidia-t4-x1.yml' # Follow-on workflow + - '.github/workflows/e2e-nvidia-a10g-x1.yml' # This workflow concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + jobs: - start-runner: - name: 
Start external EC2 runner + start-medium-ec2-runner: runs-on: ubuntu-latest outputs: label: ${{ steps.start-ec2-runner.outputs.label }} @@ -40,6 +47,7 @@ jobs: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws-region: ${{ secrets.AWS_REGION }} + - name: Start EC2 runner id: start-ec2-runner uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6 @@ -59,10 +67,10 @@ jobs: {"Key": "GitHubPR", "Value": "${{ github.event.number }}"} ] - e2e: - name: E2E Test - needs: start-runner - runs-on: ${{ needs.start-runner.outputs.label }} + e2e-medium-test: + needs: + - start-medium-ec2-runner + runs-on: ${{ needs.start-medium-ec2-runner.outputs.label }} # It is important that this job has no write permissions and has # no access to any secrets. This part (e2e) is where we are running @@ -70,12 +78,10 @@ jobs: permissions: {} steps: - # for debugging - - name: Print environment state + - name: Install Packages run: | - echo "Current Working Directory: $PWD" - echo "Files in Local Directory:" - ls -l + cat /etc/os-release + sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel - name: Checkout instructlab/instructlab uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 @@ -93,30 +99,19 @@ jobs: # https://github.com/actions/checkout/issues/249 fetch-depth: 0 - # for debugging - - name: Print environment state - run: | - echo "Current Working Directory: $PWD" - echo "Files in Local Directory:" - ls -l - - name: Fetch and checkout PR - id: fetch_pr - if: github.event_name == 'pull_request_target' + if: ${{ github.event_name == 'pull_request_target' }} working-directory: ./training run: | git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-${{ github.event.pull_request.number }} git checkout pr-${{ github.event.pull_request.number }} - - name: Install system packages - run: | - cat /etc/os-release - sudo dnf install -y gcc gcc-c++ make git 
python3.11 python3.11-devel - - - name: Install instructlab + - name: Install ilab working-directory: ./instructlab run: | - export PATH="/home/ec2-user/.local/bin:/usr/local/cuda/bin:$PATH" + export CUDA_HOME="/usr/local/cuda" + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" + export PATH="$PATH:$CUDA_HOME/bin" python3.11 -m venv --upgrade-deps venv . venv/bin/activate nvidia-smi @@ -127,7 +122,7 @@ jobs: # https://github.com/instructlab/instructlab/issues/1821 # install with Torch and build dependencies installed python3.11 -m pip install packaging wheel setuptools-scm - python3.11 -m pip install .[cuda] + python3.11 -m pip install .[cuda] -r requirements-vllm-cuda.txt - name: Update instructlab-training library working-directory: ./training @@ -136,17 +131,21 @@ jobs: pip install . pip install .[cuda] + - name: Check disk + run: | + df -h + - name: Run e2e test working-directory: ./instructlab run: | . venv/bin/activate - ./scripts/basic-workflow-tests.sh -a + ./scripts/e2e-ci.sh -m - stop-runner: + stop-medium-ec2-runner: name: Stop external EC2 runner needs: - - start-runner - - e2e + - start-medium-ec2-runner + - e2e-medium-test runs-on: ubuntu-latest if: ${{ always() }} steps: @@ -161,13 +160,13 @@ jobs: with: mode: stop github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} - label: ${{ needs.start-runner.outputs.label }} - ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }} + label: ${{ needs.start-medium-ec2-runner.outputs.label }} + ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }} - e2e-workflow-complete: + e2e-medium-workflow-complete: # we don't want to block PRs on failed EC2 cleanup # so not requiring "stop-runner" as well - needs: ["start-runner", "e2e"] + needs: ["start-medium-ec2-runner", "e2e-medium-test"] runs-on: ubuntu-latest steps: - name: E2E Workflow Complete diff --git a/README.md b/README.md index f57937e9..880afd5b 100644 --- a/README.md +++ 
b/README.md @@ -1,6 +1,7 @@ # InstructLab Training Library ![Lint](https://github.com/instructlab/training/actions/workflows/lint.yml/badge.svg?branch=main) +![`e2e-nvidia-a10g-x1.yml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-a10g-x1.yml/badge.svg?branch=main) ![Build](https://github.com/instructlab/training/actions/workflows/pypi.yaml/badge.svg?branch=main) ![Release](https://img.shields.io/github/v/release/instructlab/training) ![License](https://img.shields.io/github/license/instructlab/training)