From d912c2cc2fa8a41284851944fe4b2066fb85aef9 Mon Sep 17 00:00:00 2001 From: Nathan Weinberg Date: Fri, 25 Oct 2024 10:41:49 -0400 Subject: [PATCH] ci: convert med E2E CI job to L4 GPU also adds '-v' to 'pip install' so we can see environmental variable info for debugging issues related to installation Signed-off-by: Nathan Weinberg --- .github/mergify.yml | 6 +++--- ...e-nvidia-a10g-x1.yml => e2e-nvidia-l4-x1.yml} | 16 ++++++++-------- README.md | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) rename .github/workflows/{e2e-nvidia-a10g-x1.yml => e2e-nvidia-l4-x1.yml} (94%) diff --git a/.github/mergify.yml b/.github/mergify.yml index 404565f2..e9b9f510 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -28,18 +28,18 @@ pull_request_rules: # e2e medium workflow - or: - and: - # note this should match the triggering criteria in 'e2e-nvidia-a10g-x1.yml' + # note this should match the triggering criteria in 'e2e-nvidia-l4-x1.yml' - check-success~=e2e-medium-workflow-complete - or: - files~=\.py$ - files=pyproject.toml - files~=^requirements.*\.txt$ - - files=.github/workflows/e2e-nvidia-a10g-x1.yml + - files=.github/workflows/e2e-nvidia-l4-x1.yml - and: - -files~=\.py$ - -files=pyproject.toml - -files~=^requirements.*\.txt$ - - -files=.github/workflows/e2e-nvidia-a10g-x1.yml + - -files=.github/workflows/e2e-nvidia-l4-x1.yml # code lint workflow - or: diff --git a/.github/workflows/e2e-nvidia-a10g-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml similarity index 94% rename from .github/workflows/e2e-nvidia-a10g-x1.yml rename to .github/workflows/e2e-nvidia-l4-x1.yml index f145ebd8..5453ff3a 100644 --- a/.github/workflows/e2e-nvidia-a10g-x1.yml +++ b/.github/workflows/e2e-nvidia-l4-x1.yml @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -name: E2E (NVIDIA A10G x1) +name: E2E (NVIDIA L4 x1) on: # run against every merge commit to 'main' and release branches @@ -18,7 +18,7 @@ on: - '**.py' - 'pyproject.toml' - 'requirements**.txt' - - '.github/workflows/e2e-nvidia-a10g-x1.yml' # This workflow + - '.github/workflows/e2e-nvidia-l4-x1.yml' # This workflow concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -55,7 +55,7 @@ jobs: mode: start github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} ec2-image-id: ${{ vars.AWS_EC2_AMI }} - ec2-instance-type: g5.4xlarge + ec2-instance-type: g6.8xlarge subnet-id: subnet-02d230cffd9385bd4 security-group-id: sg-06300447c4a5fbef3 iam-role-name: instructlab-ci-runner @@ -117,19 +117,19 @@ jobs: nvidia-smi python3.11 -m pip cache remove llama_cpp_python - CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install . + CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install -v . # https://github.com/instructlab/instructlab/issues/1821 # install with Torch and build dependencies installed - python3.11 -m pip install packaging wheel setuptools-scm - python3.11 -m pip install .[cuda] -r requirements-vllm-cuda.txt + python3.11 -m pip install -v packaging wheel setuptools-scm + python3.11 -m pip install -v .[cuda] -r requirements-vllm-cuda.txt - name: Update instructlab-training library working-directory: ./training run: | . ../instructlab/venv/bin/activate - pip install . - pip install .[cuda] + pip install -v . + pip install -v .[cuda] - name: Check disk run: | diff --git a/README.md b/README.md index be396a64..27219af9 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ ![Release](https://img.shields.io/github/v/release/instructlab/training) ![License](https://img.shields.io/github/license/instructlab/training) -![`e2e-nvidia-a10g-x1.yml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-a10g-x1.yml/badge.svg?branch=main) +![`e2e-nvidia-l4-x1.yml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-l4-x1.yml/badge.svg?branch=main) ![`e2e-nvidia-l40s-x4.yml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-l40s-x4.yml/badge.svg?branch=main) - [Installing](#installing-the-library)