From 8985eb5b83d86fc2f76fdcbfa75860403f3a35cb Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Thu, 4 Apr 2024 11:40:19 -0700 Subject: [PATCH] Update workflows that use cu116 to cu117 (#5361) The following workflows specified runners with cu116; this change updates them to cu117. Workflows impacted: - [x] nv-accelerate-v100 - [new build](https://github.com/microsoft/DeepSpeed/actions/runs/8557768042/job/23450811816?pr=5361): 22 passed, 5 skipped, 11 warnings in 129.04s (0:02:09) - [old build](https://github.com/microsoft/DeepSpeed/actions/runs/8547131990/job/23418750315): 22 passed, 5 skipped, 11 warnings in 318.84s (0:05:18) - [x] nv-ds-chat - [new build](https://github.com/microsoft/DeepSpeed/actions/runs/8546543733/job/23417119129): 15 passed, 1 skipped in 2729.91s (0:45:29) - [old build](https://github.com/microsoft/DeepSpeed/actions/runs/8531148226/job/23370268262): 15 passed, 1 skipped in 3511.82s (0:58:31) - [x] nv-inference - recently failing and disabled; needs fixes.
- [new build](https://github.com/microsoft/DeepSpeed/actions/runs/8558749560): 36 failed, 74 passed, 95 skipped, 4 warnings in 877.45s (0:14:37) - [old build](https://github.com/microsoft/DeepSpeed/actions/runs/8546382497/job/23416626521): 36 failed, 74 passed, 95 skipped, 4 warnings in 3633.34s (1:00:33) - [x] nv-mii - [new build](https://github.com/microsoft/DeepSpeed/actions/runs/8557768075/job/23450812054?pr=5361): 4 passed, 23 deselected, 3 warnings in 116.28s (0:01:56) - [old build](https://github.com/microsoft/DeepSpeed/actions/runs/8547246351/job/23419064526): 4 passed, 23 deselected, 3 warnings in 196.79s (0:03:16) - [x] nv-nightly - [new build](https://github.com/microsoft/DeepSpeed/actions/runs/8557763671/job/23450792634): 3 passed, 3 skipped, 4713 deselected, 1 warning in 1831.83s (0:30:31) - [old build](https://github.com/microsoft/DeepSpeed/actions/runs/8547230983/job/23419020962): 3 passed, 3 skipped, 4713 deselected, 1 warning in 2459.06s (0:40:59) - [x] nv-torch-latest-v100 - [new build](https://github.com/microsoft/DeepSpeed/actions/runs/8557768039/job/23450811779): 947 passed, 169 skipped, 4 warnings in 2550.25s (0:42:30) and 61 passed, 4 skipped, 4643 deselected, 1 warning in 563.34s (0:09:23) - [old build](https://github.com/microsoft/DeepSpeed/actions/runs/8547232496/job/23419024966): 947 passed, 169 skipped, 4 warnings in 3216.47s (0:53:36) and 61 passed, 4 skipped, 4643 deselected, 1 warning in 611.17s (0:10:11) - [x] nv-torch-nightly-v100 - [new build](https://github.com/microsoft/DeepSpeed/actions/runs/8558930744): 13 failed, 982 passed, 121 skipped, 4 warnings in 2691.26s (0:44:51) - [old build](https://github.com/microsoft/DeepSpeed/actions/runs/8558895638): 13 failed, 982 passed, 121 skipped, 4 warnings in 3117.03s (0:51:57) - [x] nv-transformers-v100 - disabled for 4 months, needs work regardless. 
--- .github/workflows/nv-accelerate-v100.yml | 2 +- .github/workflows/nv-ds-chat.yml | 2 +- .github/workflows/nv-inference.yml | 2 +- .github/workflows/nv-mii.yml | 2 +- .github/workflows/nv-nightly.yml | 6 +++--- .github/workflows/nv-torch-latest-v100.yml | 2 +- .github/workflows/nv-torch-nightly-v100.yml | 2 +- .github/workflows/nv-transformers-v100.yml | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml index 93286b62610a..1fccbece2994 100644 --- a/.github/workflows/nv-accelerate-v100.yml +++ b/.github/workflows/nv-accelerate-v100.yml @@ -19,7 +19,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu117, v100] steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml index 61011a85b92c..f61637be7e0e 100644 --- a/.github/workflows/nv-ds-chat.yml +++ b/.github/workflows/nv-ds-chat.yml @@ -21,7 +21,7 @@ permissions: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu117, v100] steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml index 2b74e7e155df..6b339f457802 100644 --- a/.github/workflows/nv-inference.yml +++ b/.github/workflows/nv-inference.yml @@ -22,7 +22,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu117, v100] steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml index 0b3f128be5a4..31379f7e758b 100644 --- a/.github/workflows/nv-mii.yml +++ b/.github/workflows/nv-mii.yml @@ -27,7 +27,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu117, v100] steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/nv-nightly.yml 
b/.github/workflows/nv-nightly.yml index e540b5acaf33..ca091990cf4b 100644 --- a/.github/workflows/nv-nightly.yml +++ b/.github/workflows/nv-nightly.yml @@ -15,7 +15,7 @@ permissions: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu117, v100] steps: - uses: actions/checkout@v3 @@ -25,7 +25,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --index-url https://download.pytorch.org/whl/cu116 + pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --index-url https://download.pytorch.org/whl/cu117 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -55,7 +55,7 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.6" + pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.7" - name: Open GitHub issue if nightly CI fails if: ${{ failure() && (github.event_name == 'schedule') }} diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index e2d0f172dcbf..14d33680521d 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -19,7 +19,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu117, v100] steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml index f46c5089b241..bd13047f6078 100644 --- a/.github/workflows/nv-torch-nightly-v100.yml +++ b/.github/workflows/nv-torch-nightly-v100.yml @@ -15,7 +15,7 @@ permissions: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu117, v100] steps: - uses: actions/checkout@v3 diff --git 
a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml index 4fbc42abec5f..75f53c95c235 100644 --- a/.github/workflows/nv-transformers-v100.yml +++ b/.github/workflows/nv-transformers-v100.yml @@ -18,7 +18,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu117, v100] steps: - uses: actions/checkout@v3