Skip to content

Commit

Permalink
Refactor of XPK
Browse files Browse the repository at this point in the history
Obliviour committed Apr 11, 2024

Unverified

This commit is not signed, but one or more authors require that any commit attributed to them be signed.
1 parent eff0dd0 commit 5fdcfcf
Showing 4 changed files with 352 additions and 273 deletions.
8 changes: 1 addition & 7 deletions .github/workflows/build_tests.yaml
Original file line number Diff line number Diff line change
@@ -26,7 +26,7 @@ env:
PATHWAYS_WORKLOAD_NAME: xpkpw-build-${{ github.run_attempt }}

jobs:
cluster-create-and-delete:
tpu-cluster-workload-workflow:
runs-on: [ubuntu-20.04]
concurrency: # We support one build or nightly test to run at a time currently.
group: build-test-cluster-group
@@ -70,9 +70,3 @@ jobs:
- name: Delete the cluster created
if: always()
run: python xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b






143 changes: 105 additions & 38 deletions .github/workflows/nightly_tests.yaml
Original file line number Diff line number Diff line change
@@ -22,15 +22,15 @@ on:
env:
# Names must be unique in parallel running tests.
EMPTY_CLUSTER_NAME: nightly-xpk-zero-nodepools
TPU_CLUSTER_NAME: nightly-xpk-2-v4-8-nodepools
WORKLOAD_NAME: xpktest-nightly-${{ github.run_attempt }}
TPU_CLUSTER_NAME: nightly-xpk-2-v4-8-nodepools
PATHWAYS_TPU_CLUSTER_NAME: pw-nightly-test-2-v4-8-nodepools
AUTOPROVISION_CLUSTER_NAME: autoprovision-nightly-test
WORKLOAD_NAME: xpktest-nightly-${{ github.run_attempt }}
PATHWAYS_WORKLOAD_NAME: xpkpw-nightly-${{ github.run_attempt }}

jobs:
cluster-create-and-delete:
tpu-cluster-workload-workflow:
runs-on: [ubuntu-20.04]
concurrency: # We support one build test to run at a time currently.
concurrency: # We support one build per job to run at a time currently.
group: nightly-test-cluster-group
cancel-in-progress: false
steps:
@@ -71,41 +71,108 @@ jobs:
- name: Delete the cluster created
if: always()
run: python xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b

command-help-test:
  # Runs `--help` for the top-level xpk command and for the cluster,
  # workload and inspector subcommands listed in the steps below.
  runs-on:
    - ubuntu-20.04
  # We support one build test to run at a time currently.
  concurrency:
    group: nightly-command-help-test-cluster-group
    cancel-in-progress: false
  steps:
    # Repository checkout, Python and gcloud tooling.
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v5
      with:
        # Quoted so YAML does not read the version as the float 3.1.
        python-version: '3.10'
    - uses: google-github-actions/auth@v2
      with:
        credentials_json: '${{ secrets.GCP_SA_KEY }}'
    - uses: google-github-actions/setup-gcloud@v2
      with:
        version: '>= 363.0.0'
        install_components: beta,gke-gcloud-auth-plugin
    - name: Verify gcp setup
      run: gcloud info

    # Top-level help.
    - name: XPK Help
      run: python3 xpk.py --help

    # `cluster` command group.
    - name: XPK Cluster Help
      run: python3 xpk.py cluster --help
    - name: XPK Cluster Create Help
      run: python3 xpk.py cluster create --help
    - name: XPK Cluster Delete Help
      run: python3 xpk.py cluster delete --help
    - name: XPK Cluster Describe Help
      run: python3 xpk.py cluster describe --help

    # `workload` command group.
    - name: XPK Workload Help
      run: python3 xpk.py workload --help
    - name: XPK Workload Create Help
      run: python3 xpk.py workload create --help
    - name: XPK Workload Delete Help
      run: python3 xpk.py workload delete --help
    - name: XPK Workload List Help
      run: python3 xpk.py workload list --help

    # `inspector` command.
    - name: XPK Inspector Help
      run: python3 xpk.py inspector list --help
xpk-tpu-autoprovisioning-test:
  # Creates a cluster with --enable-autoprovisioning, runs a 2x v4-8 workload
  # and then a 1x v4-16 workload on it, and deletes the cluster at the end.
  runs-on: [ubuntu-20.04]
  # We support one build test to run at a time currently.
  concurrency:
    group: nightly-autoprovisioning-test-cluster-group
    cancel-in-progress: false
  steps:
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v5
      with:
        # Quoted so YAML does not read the version as the float 3.1.
        python-version: '3.10'
    - uses: 'google-github-actions/auth@v2'
      with:
        credentials_json: '${{ secrets.GCP_SA_KEY }}'
    - uses: google-github-actions/setup-gcloud@v2
      with:
        version: '>= 363.0.0'
        install_components: 'beta,gke-gcloud-auth-plugin'

    # `>-` folds the lines below back into the same single-line command.
    - name: Create an autoprovisioned enabled XPK Cluster with 2 x v4-8 nodepools
      run: >-
        python xpk.py cluster create
        --cluster $AUTOPROVISION_CLUSTER_NAME
        --enable-autoprovisioning
        --device-type=v4-8
        --num-slices=2
        --zone=us-central2-b
        --default-pool-cpu-machine-type=n1-standard-16
        --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}'
        --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS }}'
    - name: Create test script to execute in workloads
      run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > test.sh

    # First workload: 2 slices of v4-8.
    - name: Run a 2x v4-8 workload on Ubuntu base image
      run: >-
        python xpk.py workload create
        --cluster $AUTOPROVISION_CLUSTER_NAME
        --workload $WORKLOAD_NAME
        --tpu-type=v4-8
        --num-slices=2
        --zone=us-central2-b
        --command "bash test.sh"
    - name: Wait for 2x v4-8 workload completion and confirm it succeeded
      run: >-
        python3 xpk.py workload list
        --cluster $AUTOPROVISION_CLUSTER_NAME
        --zone=us-central2-b
        --wait-for-job-completion $WORKLOAD_NAME
        --timeout 300

    # Second workload: 1 slice of v4-16, named with a "-v4-16" suffix.
    - name: Run a 1x v4-16 workload
      run: >-
        python xpk.py workload create
        --cluster $AUTOPROVISION_CLUSTER_NAME
        --workload ${WORKLOAD_NAME}-v4-16
        --tpu-type=v4-16
        --num-slices=1
        --zone=us-central2-b
        --command "bash test.sh"
    # Label fixed: this step waits on the v4-16 workload, not the v4-8 one.
    - name: Wait for 1x v4-16 workload completion and confirm it succeeded. Give 20 minutes to allow the node pools to re-provision.
      run: >-
        python3 xpk.py workload list
        --cluster $AUTOPROVISION_CLUSTER_NAME
        --zone=us-central2-b
        --wait-for-job-completion ${WORKLOAD_NAME}-v4-16
        --timeout 1200

    # Cleanup of both workloads, then the cluster.
    - name: Delete the 2x v4-8 workload on the cluster
      run: >-
        python3 xpk.py workload delete
        --workload $WORKLOAD_NAME
        --cluster $AUTOPROVISION_CLUSTER_NAME
        --zone=us-central2-b
    - name: Delete the 1x v4-16 workload on the cluster
      run: >-
        python3 xpk.py workload delete
        --workload ${WORKLOAD_NAME}-v4-16
        --cluster $AUTOPROVISION_CLUSTER_NAME
        --zone=us-central2-b
    # Label fixed: this deletes the autoprovisioning cluster, not the
    # Pathways one. `always()` keeps the cleanup running after failures.
    - name: Delete the autoprovisioned cluster created
      if: always()
      run: >-
        python xpk.py cluster delete
        --cluster $AUTOPROVISION_CLUSTER_NAME
        --zone=us-central2-b
pw-cluster-and-workload:
  # Creates the Pathways test cluster, runs one workload on it, waits for
  # the workload to finish, then deletes the workload and the cluster.
  runs-on:
    - ubuntu-20.04
  # We support one build test to run at a time currently.
  concurrency:
    group: nightly-pw-test-cluster-group
    cancel-in-progress: false
  steps:
    # Repository checkout, Python and gcloud tooling.
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v5
      with:
        # Quoted so YAML does not read the version as the float 3.1.
        python-version: '3.10'
    - uses: google-github-actions/auth@v2
      with:
        credentials_json: '${{ secrets.GCP_SA_KEY }}'
    - uses: google-github-actions/setup-gcloud@v2
      with:
        version: '>= 363.0.0'
        install_components: beta,gke-gcloud-auth-plugin

    # `>-` folds the lines below back into the same single-line command.
    - name: Create an Pathways-enabled XPK Cluster with 2 x v4-8 nodepools
      run: >-
        python xpk.py cluster create
        --cluster $PATHWAYS_TPU_CLUSTER_NAME
        --device-type=v4-8
        --num-slices=2
        --zone=us-central2-b
        --default-pool-cpu-machine-type=n1-standard-16
        --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}'
    - name: Create test script to execute in workloads
      run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > test.sh
    - name: Run a Pathways workload on Ubuntu base image
      run: >-
        python xpk.py workload create
        --cluster $PATHWAYS_TPU_CLUSTER_NAME
        --workload $PATHWAYS_WORKLOAD_NAME
        --docker-image='marketplace.gcr.io/google/ubuntu2004'
        --tpu-type=v4-8
        --num-slices=2
        --zone=us-central2-b
        --command "bash test.sh"
    - name: Wait for Pathways workload completion and confirm it succeeded
      run: >-
        python3 xpk.py workload list
        --cluster $PATHWAYS_TPU_CLUSTER_NAME
        --zone=us-central2-b
        --wait-for-job-completion $PATHWAYS_WORKLOAD_NAME
        --timeout 300
    - name: Delete the Pathways workload on the cluster
      run: >-
        python3 xpk.py workload delete
        --workload $PATHWAYS_WORKLOAD_NAME
        --cluster $PATHWAYS_TPU_CLUSTER_NAME
        --zone=us-central2-b
    # `always()` keeps the cluster cleanup running after earlier failures.
    - name: Delete the Pathways cluster created
      if: always()
      run: >-
        python xpk.py cluster delete
        --cluster $PATHWAYS_TPU_CLUSTER_NAME
        --zone=us-central2-b







steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- uses: 'google-github-actions/auth@v2'
with:
credentials_json: '${{ secrets.GCP_SA_KEY }}'
- uses: google-github-actions/setup-gcloud@v2
with:
version: '>= 363.0.0'
install_components: 'beta,gke-gcloud-auth-plugin'
- name: Create an Pathways-enabled XPK Cluster with 2 x v4-8 nodepools
run: python xpk.py cluster create --cluster $PATHWAYS_TPU_CLUSTER_NAME --device-type=v4-8 --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}'
- name: Create test script to execute in workloads
run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > test.sh
- name: Run a Pathways workload on Ubuntu base image
run: python xpk.py workload create --cluster $PATHWAYS_TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b --command "bash test.sh"
- name: Wait for Pathways workload completion and confirm it succeeded
run: python3 xpk.py workload list --cluster $PATHWAYS_TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $PATHWAYS_WORKLOAD_NAME --timeout 300
- name: Delete the Pathways workload on the cluster
run: python3 xpk.py workload delete --workload $PATHWAYS_WORKLOAD_NAME --cluster $PATHWAYS_TPU_CLUSTER_NAME --zone=us-central2-b
- name: Delete the Pathways cluster created
if: always()
run: python xpk.py cluster delete --cluster $PATHWAYS_TPU_CLUSTER_NAME --zone=us-central2-b
26 changes: 26 additions & 0 deletions .github/workflows/pylint_black.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: psf black lint

# Triggered on every push and on every pull request.
on: [push, pull_request]

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      # Same major version of checkout as the repository's other workflows.
      - uses: actions/checkout@v4
      # `--check` makes black report files it would reformat instead of
      # rewriting them; `--verbose` lists each file it examined.
      - uses: psf/black@stable
        with:
          options: "--check --verbose"
448 changes: 220 additions & 228 deletions xpk.py

Large diffs are not rendered by default.

0 comments on commit 5fdcfcf

Please sign in to comment.