From d912c2cc2fa8a41284851944fe4b2066fb85aef9 Mon Sep 17 00:00:00 2001
From: Nathan Weinberg <nweinber@redhat.com>
Date: Fri, 25 Oct 2024 10:41:49 -0400
Subject: [PATCH] ci: convert med E2E CI job to L4 GPU

Also adds '-v' to the 'pip install' commands so that
environment variable info is visible in the CI logs
when debugging installation-related issues.

Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
---
 .github/mergify.yml                              |  6 +++---
 ...e-nvidia-a10g-x1.yml => e2e-nvidia-l4-x1.yml} | 16 ++++++++--------
 README.md                                        |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)
 rename .github/workflows/{e2e-nvidia-a10g-x1.yml => e2e-nvidia-l4-x1.yml} (94%)

diff --git a/.github/mergify.yml b/.github/mergify.yml
index 404565f2..e9b9f510 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -28,18 +28,18 @@ pull_request_rules:
     # e2e medium workflow
     - or:
       - and:
-        # note this should match the triggering criteria in 'e2e-nvidia-a10g-x1.yml'
+        # note this should match the triggering criteria in 'e2e-nvidia-l4-x1.yml'
         - check-success~=e2e-medium-workflow-complete
         - or:
           - files~=\.py$
           - files=pyproject.toml
           - files~=^requirements.*\.txt$
-          - files=.github/workflows/e2e-nvidia-a10g-x1.yml
+          - files=.github/workflows/e2e-nvidia-l4-x1.yml
       - and:
         - -files~=\.py$
         - -files=pyproject.toml
         - -files~=^requirements.*\.txt$
-        - -files=.github/workflows/e2e-nvidia-a10g-x1.yml
+        - -files=.github/workflows/e2e-nvidia-l4-x1.yml
 
     # code lint workflow
     - or:
diff --git a/.github/workflows/e2e-nvidia-a10g-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml
similarity index 94%
rename from .github/workflows/e2e-nvidia-a10g-x1.yml
rename to .github/workflows/e2e-nvidia-l4-x1.yml
index f145ebd8..5453ff3a 100644
--- a/.github/workflows/e2e-nvidia-a10g-x1.yml
+++ b/.github/workflows/e2e-nvidia-l4-x1.yml
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
-name: E2E (NVIDIA A10G x1)
+name: E2E (NVIDIA L4 x1)
 
 on:
   # run against every merge commit to 'main' and release branches
@@ -18,7 +18,7 @@ on:
       - '**.py'
       - 'pyproject.toml'
       - 'requirements**.txt'
-      - '.github/workflows/e2e-nvidia-a10g-x1.yml' # This workflow
+      - '.github/workflows/e2e-nvidia-l4-x1.yml' # This workflow
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -55,7 +55,7 @@ jobs:
           mode: start
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
           ec2-image-id: ${{ vars.AWS_EC2_AMI }}
-          ec2-instance-type: g5.4xlarge
+          ec2-instance-type: g6.8xlarge
           subnet-id: subnet-02d230cffd9385bd4
           security-group-id: sg-06300447c4a5fbef3
           iam-role-name: instructlab-ci-runner
@@ -117,19 +117,19 @@ jobs:
           nvidia-smi
           python3.11 -m pip cache remove llama_cpp_python
 
-          CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install .
+          CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install -v .
 
           # https://github.com/instructlab/instructlab/issues/1821
           # install with Torch and build dependencies installed
-          python3.11 -m pip install packaging wheel setuptools-scm
-          python3.11 -m pip install .[cuda] -r requirements-vllm-cuda.txt
+          python3.11 -m pip install -v packaging wheel setuptools-scm
+          python3.11 -m pip install -v .[cuda] -r requirements-vllm-cuda.txt
 
       - name: Update instructlab-training library
         working-directory: ./training
         run: |
           . ../instructlab/venv/bin/activate
-          pip install .
-          pip install .[cuda]
+          pip install -v .
+          pip install -v .[cuda]
 
       - name: Check disk
         run: |
diff --git a/README.md b/README.md
index be396a64..27219af9 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 ![Release](https://img.shields.io/github/v/release/instructlab/training)
 ![License](https://img.shields.io/github/license/instructlab/training)
 
-![`e2e-nvidia-a10g-x1.yml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-a10g-x1.yml/badge.svg?branch=main)
+![`e2e-nvidia-l4-x1.yml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-l4-x1.yml/badge.svg?branch=main)
 ![`e2e-nvidia-l40s-x4.yml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-l40s-x4.yml/badge.svg?branch=main)
 
 - [Installing](#installing-the-library)