Skip to content

Fix development branch #2018

Fix development branch

Fix development branch #2018

Workflow file for this run

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Build Tests

# Triggers: every push to main, plus pull requests. Without a `types` filter,
# pull_request fires on GitHub's default activity types (opened, synchronize,
# reopened).
on:
  push:
    branches:
      - main
  pull_request:

# Workflow-wide environment shared by every job.
env:
  # Cluster names. Names must not collide between tests that run in parallel.
  TPU_CLUSTER_NAME: 'build-xpk-2-v4-8-nodepools'
  TPU_FILESTORE_CLUSTER_NAME: 'build-xpk-filestore'

  # Workload names, suffixed with the attempt number of the current run.
  WORKLOAD_NAME: 'xpktest-build-${{ github.run_attempt }}'
  PATHWAYS_WORKLOAD_NAME: 'xpkpw-build-${{ github.run_attempt }}'
  FS_WRITE_WORKLOAD: 'fs-write-workload'
  FS_READ_WORKLOAD: 'fs-read-workload'
  FS_DELETE_WORKLOAD: 'fs-delete-workload'

  # Storage names used by the storage create/list steps.
  STORAGE_NAME: 'test-storage'
  FS_STORAGE_NAME: '${{secrets.INSTANCE_NAME}}-test-storage'

  # Extra arguments handed to --custom-cluster-arguments at cluster creation.
  CLUSTER_ARGUMENTS: '--network=${{secrets.NETWORK_NAME}} --subnetwork=${{secrets.SUBNETWORK_NAME}} --maintenance-window=23:50'
  PW_CLUSTER_ARGUMENTS: '--network=${{secrets.NETWORK_NAME}} --maintenance-window=23:50'

jobs:
run-filestore-workload:
  # Filestore end-to-end test: creates a cluster with the Filestore CSI driver
  # enabled, attaches an auto-mounted Filestore storage, then runs three
  # workloads that write, read back and delete a file under the mount point.
  # All created workloads and the cluster are removed at the end, even when an
  # earlier step failed.
  #
  # The ubuntu-20.04 hosted runner image has been retired by GitHub, so the job
  # targets ubuntu-22.04 instead.
  runs-on: [ubuntu-22.04]
  concurrency: # We support one build or nightly test to run at a time currently.
    group: filestore-test-group
    cancel-in-progress: false
  steps:
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v5
      with:
        # Quoted so YAML does not read the version as the float 3.1.
        python-version: '3.10'
    - uses: google-github-actions/auth@v2
      with:
        credentials_json: '${{ secrets.GCP_SA_KEY }}'
    - uses: google-github-actions/setup-gcloud@v2
      with:
        version: '>= 363.0.0'
        install_components: 'beta,gke-gcloud-auth-plugin'
    - name: Generate random seed
      # Exported through GITHUB_ENV so later steps can use $RANDOM_SEED as a
      # per-run directory name on the Filestore mount.
      run: |
        RANDOM_SEED=$((RANDOM % 10000)) # Generate a random number between 0 and 9999
        echo "RANDOM_SEED=$RANDOM_SEED" >> $GITHUB_ENV
    - name: Verify gcp setup
      run: gcloud info
    - name: Set Google Cloud CLI properties to a unused zone to verify --zone arg is passed properly in commands.
      run: |
        gcloud config set compute/zone us-east4-a
        gcloud config get compute/zone
    - name: Install xpk with pip and verify it executes corretly
      run: |
        pip install .
        xpk --help
    - name: Authenticate Docker
      run: gcloud auth configure-docker --quiet
    - name: Create a XPK Cluster with 2x v4-8 nodepools. Larger num-nodes to avoid master resizing.
      run: |
        python3 xpk.py cluster create --cluster $TPU_FILESTORE_CLUSTER_NAME --tpu-type=v4-8 --num-slices=2 \
        --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=16 \
        --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --enable-workload-identity --enable-gcpfilestore-csi-driver --custom-cluster-arguments="${CLUSTER_ARGUMENTS}"
    - name: Fill Filestore manifest file
      # Substitutes the placeholder tokens in the manifest in place with
      # values taken from repository secrets.
      run: |
        sed -i 's/PROJECT_NAME/${{secrets.PROJECT_NAME}}/g; s/ZONE/us-central2-a/g; s/INSTANCE_NAME/${{secrets.INSTANCE_NAME}}/g; s/VOL_NAME/${{secrets.VOL_NAME}}/g; s/IP_ADDRESS/${{secrets.IP_ADDRESS}}/g' ./tests/data/fs-manifest.yaml
    - name: Create auto-mount GCP Filestore Storage instance
      run: |
        python3 xpk.py storage create $FS_STORAGE_NAME --cluster=$TPU_FILESTORE_CLUSTER_NAME --zone=us-central2-b --type=gcpfilestore \
        --auto-mount=true \
        --mount-point='/fs-test-mount-point' --readonly=false --manifest='./tests/data/fs-manifest.yaml'
    - name: List and verify existing Storages
      # Fails the step when the storage name is absent from the listing.
      run: python3 xpk.py storage list --cluster $TPU_FILESTORE_CLUSTER_NAME --zone=us-central2-b | tee output.txt | grep $FS_STORAGE_NAME || (echo 'No storage found' && exit 143)
    # The three workload-create steps below previously passed
    # `--zone us-central2` (a region name). They now use the same
    # us-central2-b zone as every other command in this job.
    - name: Run workload to write file on filestore
      run: |
        python3 xpk.py workload create --workload $FS_WRITE_WORKLOAD \
          --command "mkdir /fs-test-mount-point/$RANDOM_SEED/ && echo 'Test text message' > /fs-test-mount-point/$RANDOM_SEED/test.txt || (echo 'Writing to filestore failed' && exit 143)" \
          --cluster $TPU_FILESTORE_CLUSTER_NAME --tpu-type=v4-8 --zone=us-central2-b
    - name: Wait for writer workload completion and confirm it succeeded
      run: python3 xpk.py workload list --cluster $TPU_FILESTORE_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $FS_WRITE_WORKLOAD --timeout 300
    - name: Run workload to read file on filestore
      run: |
        python3 xpk.py workload create --workload $FS_READ_WORKLOAD \
          --command "cat /fs-test-mount-point/$RANDOM_SEED/test.txt | grep 'Test text message' || (echo 'Reading from filestore failed' && exit 143)" \
          --cluster $TPU_FILESTORE_CLUSTER_NAME --tpu-type=v4-8 --zone=us-central2-b
    - name: Wait for reader workload completion and confirm it succeeded
      run: python3 xpk.py workload list --cluster $TPU_FILESTORE_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $FS_READ_WORKLOAD --timeout 300
    - name: Run workload to delete file on filestore
      run: |
        python3 xpk.py workload create --workload $FS_DELETE_WORKLOAD \
          --command "rm -rf /fs-test-mount-point/$RANDOM_SEED/test.txt || exit 143" \
          --cluster $TPU_FILESTORE_CLUSTER_NAME --tpu-type=v4-8 --zone=us-central2-b
    - name: Wait for delete workload completion and confirm it succeeded
      run: python3 xpk.py workload list --cluster $TPU_FILESTORE_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $FS_DELETE_WORKLOAD --timeout 300
    # Teardown: `if: always()` keeps each step running even after a failure
    # earlier in the job, so workloads and the cluster are not left behind.
    - name: Delete the writer workload on the cluster
      if: always()
      run: python3 xpk.py workload delete --workload $FS_WRITE_WORKLOAD --cluster $TPU_FILESTORE_CLUSTER_NAME --zone=us-central2-b
    - name: Delete the reader workload on the cluster
      if: always()
      run: python3 xpk.py workload delete --workload $FS_READ_WORKLOAD --cluster $TPU_FILESTORE_CLUSTER_NAME --zone=us-central2-b
    - name: Delete the delete workload on the cluster
      if: always()
      run: python3 xpk.py workload delete --workload $FS_DELETE_WORKLOAD --cluster $TPU_FILESTORE_CLUSTER_NAME --zone=us-central2-b
    - name: Delete the cluster created
      if: always()
      run: python3 xpk.py cluster delete --cluster $TPU_FILESTORE_CLUSTER_NAME --zone=us-central2-b
cluster-create-and-delete:
  # Cluster end-to-end test: creates a Pathways-enabled cluster, attaches an
  # auto-mounted gcsfuse storage, runs a regular workload (which writes a file
  # through the mount) and a Pathways workload, verifies the file in the GCS
  # bucket, then tears everything down.
  #
  # The ubuntu-20.04 hosted runner image has been retired by GitHub, so the job
  # targets ubuntu-22.04 instead.
  runs-on: [ubuntu-22.04]
  concurrency: # We support one build or nightly test to run at a time currently.
    group: build-test-cluster-group
    cancel-in-progress: false
  steps:
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v5
      with:
        # Quoted so YAML does not read the version as the float 3.1.
        python-version: '3.10'
    - uses: google-github-actions/auth@v2
      with:
        credentials_json: '${{ secrets.GCP_SA_KEY }}'
    - uses: google-github-actions/setup-gcloud@v2
      with:
        version: '>= 363.0.0'
        install_components: 'beta,gke-gcloud-auth-plugin'
    - name: Generate random seed
      # Exported through GITHUB_ENV; later steps use $RANDOM_SEED as the name
      # of the file written through the storage mount.
      run: |
        RANDOM_SEED=$((RANDOM % 10000)) # Generate a random number between 0 and 9999
        echo "RANDOM_SEED=$RANDOM_SEED" >> $GITHUB_ENV
    - name: Verify gcp setup
      run: gcloud info
    - name: Set Google Cloud CLI properties to a unused zone to verify --zone arg is passed properly in commands.
      run: |
        gcloud config set compute/zone us-east4-a
        gcloud config get compute/zone
    - name: Install xpk with pip and verify it executes corretly
      run: |
        pip install .
        xpk --help
    - name: Create a Pathways-enabled XPK Cluster with 2x v4-8 nodepools. Larger num-nodes to avoid master resizing.
      run: |
        python3 xpk.py cluster create-pathways --cluster $TPU_CLUSTER_NAME --tpu-type=v4-8 --num-slices=2 \
        --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=16 \
        --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --enable-workload-identity --enable-gcsfuse-csi-driver --enable-gcpfilestore-csi-driver --custom-cluster-arguments="${PW_CLUSTER_ARGUMENTS}"
    - name: Authenticate Docker
      run: gcloud auth configure-docker --quiet
    - name: Create auto-mount Storage instance
      run: |
        python3 xpk.py storage create $STORAGE_NAME --cluster=$TPU_CLUSTER_NAME --zone=us-central2-b --type=gcsfuse \
        --auto-mount=true \
        --mount-point='/test-mount-point' --readonly=false --manifest='./tests/data/pv-pvc-templates.yaml'
    - name: List and verify existing Storages
      # Greps for $STORAGE_NAME (currently 'test-storage') rather than a
      # hard-coded copy of it, and dumps the listing when the name is missing.
      run: python3 xpk.py storage list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | tee output.txt | grep "$STORAGE_NAME" || (echo 'No storage found' && cat output.txt && exit 1)
    - name: Create test script to execute in workloads
      # Writes test.sh; the script stores a greeting in <RANDOM_SEED>.txt
      # under the storage mount point.
      run: |
        echo -e \
        '#!/bin/bash \n
        echo "Hello world from a test script!"
        cd ~/../test-mount-point && echo "Hello world from a Github Action CI/CD test script!" > '$RANDOM_SEED'.txt' \
        > test.sh
    - name: Run a base-docker-image workload
      run: |
        python3 xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash test.sh" \
        --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b
    - name: Run xpk inspector with the workload created above
      run: python3 xpk.py inspector --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --workload $WORKLOAD_NAME
    - name: Wait for workload completion and confirm it succeeded
      run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $WORKLOAD_NAME --timeout 300
    - name: Run a Pathways workload on Ubuntu base image
      run: |
        python3 xpk.py workload create-pathways --cluster $TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME \
        --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b \
        --command "echo \"Hello world from a test script! \""
    - name: Verify if the file was created in the GCS bucket
      run: gsutil cp gs://xpk-ci-cd-tests/$RANDOM_SEED.txt .
    - name: Check if the file contains desired content
      run: grep 'Hello world from a Github Action CI/CD test script!' $RANDOM_SEED.txt
    - name: Wait for Pathways workload completion and confirm it succeeded
      run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $PATHWAYS_WORKLOAD_NAME --timeout 300
    - name: List out the workloads on the cluster
      run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
    # Teardown. These steps had been commented out, which left the cluster,
    # both workloads, the storage and the GCS object behind after every run.
    # Workload names depend only on github.run_attempt and the cluster name is
    # fixed, so leftovers could collide with the next run. `if: always()`
    # keeps each cleanup step running even after an earlier failure.
    - name: Delete the workload on the cluster
      if: always()
      run: python3 xpk.py workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
    - name: Delete the Pathways workload on the cluster
      if: always()
      run: python3 xpk.py workload delete --workload $PATHWAYS_WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
    - name: Delete created GCS file
      if: always()
      run: gsutil rm gs://xpk-ci-cd-tests/$RANDOM_SEED.txt
    - name: Delete existing Storage
      if: always()
      run: python3 xpk.py storage delete $STORAGE_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
    - name: Delete the cluster created
      if: always()
      run: python3 xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b