Skip to content

Commit

Permalink
Test
Browse files: browse the repository at this point in the history
  • Loading branch information
IrvingMg committed Jan 15, 2025
1 parent 6b2eb4d commit f61369f
Show file tree
Hide file tree
Showing 5 changed files with 157 additions and 150 deletions.
118 changes: 59 additions & 59 deletions .github/workflows/build_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,63 +44,63 @@ env:
REGION: us-central2

jobs:
run-unit-tests:
runs-on: [ubuntu-22.04]
concurrency: # We currently support running only one build or nightly test at a time.
group: build-test-cluster-group
cancel-in-progress: false
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- uses: google-github-actions/setup-gcloud@v2
with:
version: '>= 363.0.0'
install_components: 'beta, gke-gcloud-auth-plugin'
- name: Install dependencies
run : make install-dev
- name: Run unit tests
run: make run-unittests
# run-unit-tests:
# runs-on: [ubuntu-22.04]
# concurrency: # We currently support running only one build or nightly test at a time.
# group: build-test-cluster-group
# cancel-in-progress: false
# steps:
# - uses: actions/checkout@v4
# - uses: actions/setup-python@v5
# with:
# python-version: '3.10'
# - uses: google-github-actions/setup-gcloud@v2
# with:
# version: '>= 363.0.0'
# install_components: 'beta, gke-gcloud-auth-plugin'
# - name: Install dependencies
# run : make install-dev
# - name: Run unit tests
# run: make run-unittests

run-integration-tests:
runs-on: [ubuntu-22.04]
needs: [run-unit-tests]
concurrency: # We currently support running only one build or nightly test at a time.
group: build-test-cluster-group
cancel-in-progress: false
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- uses: 'google-github-actions/auth@v2'
with:
credentials_json: '${{ secrets.GCP_SA_KEY }}'
- uses: google-github-actions/setup-gcloud@v2
with:
version: '>= 363.0.0'
install_components: 'beta,gke-gcloud-auth-plugin, gke-gcloud-auth-plugin'
- name: Verify gcp setup
run: gcloud info
- name: Install dependencies
run : make install-dev
- name: "Set auth cidr"
run: echo "AUTH_CIDR=$(curl api.ipify.org)/32" >> $GITHUB_ENV
- name: "Set GCLOUD_CFG_PATH"
run: echo "GCLOUD_CFG_PATH=/home/runner/work/xpk/xpk/" >> $GITHUB_ENV
- name: "Copy credentials"
run: cp $GOOGLE_APPLICATION_CREDENTIALS $GCLOUD_CFG_PATH/application_default_credentials.json
- name: "Set DEPLOYMENT_DIR"
run: echo "DEPLOYMENT_DIR=$HOME/deployment" >> $GITHUB_ENV
- name: Create deployment dir
run: mkdir -p $DEPLOYMENT_DIR
- name: Run integration tests
run: make run-integrationtests
# run-integration-tests:
# runs-on: [ubuntu-22.04]
# needs: [run-unit-tests]
# concurrency: # We currently support running only one build or nightly test at a time.
# group: build-test-cluster-group
# cancel-in-progress: false
# steps:
# - uses: actions/checkout@v4
# - uses: actions/setup-python@v5
# with:
# python-version: '3.10'
# - uses: 'google-github-actions/auth@v2'
# with:
# credentials_json: '${{ secrets.GCP_SA_KEY }}'
# - uses: google-github-actions/setup-gcloud@v2
# with:
# version: '>= 363.0.0'
# install_components: 'beta,gke-gcloud-auth-plugin, gke-gcloud-auth-plugin'
# - name: Verify gcp setup
# run: gcloud info
# - name: Install dependencies
# run : make install-dev
# - name: "Set auth cidr"
# run: echo "AUTH_CIDR=$(curl api.ipify.org)/32" >> $GITHUB_ENV
# - name: "Set GCLOUD_CFG_PATH"
# run: echo "GCLOUD_CFG_PATH=/home/runner/work/xpk/xpk/" >> $GITHUB_ENV
# - name: "Copy credentials"
# run: cp $GOOGLE_APPLICATION_CREDENTIALS $GCLOUD_CFG_PATH/application_default_credentials.json
# - name: "Set DEPLOYMENT_DIR"
# run: echo "DEPLOYMENT_DIR=$HOME/deployment" >> $GITHUB_ENV
# - name: Create deployment dir
# run: mkdir -p $DEPLOYMENT_DIR
# - name: Run integration tests
# run: make run-integrationtests

cluster-create-and-delete:
runs-on: [ubuntu-22.04]
needs: [run-integration-tests]
#needs: [run-integration-tests]
concurrency: # We currently support running one nightly test and one build test per branch at a time.
group: build-test-cluster-group-${{ github.ref }}
cancel-in-progress: false
Expand Down Expand Up @@ -154,12 +154,12 @@ jobs:
run: gcloud auth configure-docker --quiet
- name: Create test script to execute in workloads
run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > workload.sh
- name: Run a base-docker-image workload
run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b
- name: Run xpk inspector with the workload created above
run: python3 xpk.py inspector --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --workload $WORKLOAD_NAME
- name: Wait for workload completion and confirm it succeeded
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $WORKLOAD_NAME --timeout 300
# - name: Run a base-docker-image workload
# run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b
# - name: Run xpk inspector with the workload created above
# run: python3 xpk.py inspector --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --workload $WORKLOAD_NAME
# - name: Wait for workload completion and confirm it succeeded
# run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $WORKLOAD_NAME --timeout 300
- name: Run a Pathways workload on Ubuntu base image
run: python xpk.py workload create-pathways --cluster $TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b --command "echo \"Hello world from a test script! \""
- name: Wait for Pathways workload completion and confirm it succeeded
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/lint_and_format.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ name: Lint and Format

on:
pull_request:
push:
branches:
- main
# push:
# branches:
# - main

jobs:
build-and-test:
Expand Down
170 changes: 83 additions & 87 deletions src/xpk/commands/workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,10 +224,6 @@
failurePolicy:
{failure_policy_rules}
maxRestarts: {args.max_restarts}
successPolicy:
operator: "All"
targetReplicatedJobs:
- {args.targetReplicatedJob}
replicatedJobs:
- name: worker
replicas: {args.num_slices}
Expand Down Expand Up @@ -382,18 +378,18 @@ def workload_create(args) -> None:
" -c 'import pathwaysutils; import jax; print(jax.devices())'"
)

set_cluster_command_code = set_cluster_command(args)
if set_cluster_command_code != 0:
xpk_exit(set_cluster_command_code)
# set_cluster_command_code = set_cluster_command(args)
# if set_cluster_command_code != 0:
# xpk_exit(set_cluster_command_code)

workload_exists = check_if_workload_exists(args)
# workload_exists = check_if_workload_exists(args)

if workload_exists:
xpk_print(
f'{args.workload} already exists, XPK will not create this workload.'
' Please pick a new workload name'
)
xpk_exit(1)
# if workload_exists:
# xpk_print(
# f'{args.workload} already exists, XPK will not create this workload.'
# ' Please pick a new workload name'
# )
# xpk_exit(1)

xpk_print('Starting workload create', flush=True)
system, return_code = get_system_characteristics(args)
Expand Down Expand Up @@ -483,79 +479,79 @@ def workload_create(args) -> None:
if return_code != 0:
xpk_exit(return_code)

if system.device_type in cluster_gcluster.supported_device_types:
yml_string = a3_gpu_workload_create_yaml.format(
args=args,
container=container,
failure_policy_rules=failure_policy_rules,
pod_failure_policy=pod_failure_policy,
)

if args.device_type == cluster_gcluster.a3mega_device_type:
sub_networks = [f'{args.cluster}-gpunet-{i}-subnet' for i in range(8)]
yml_string = tcpxo_decorator.decorate_jobset(yml_string, sub_networks)

if args.device_type == cluster_gcluster.a3ultra_device_type:
sub_networks = [f'{args.cluster}-sub-1'] + [
f'{args.cluster}-rdma-sub-{i}' for i in range(8)
]
yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)
else:
yml_string = gpu_workload_create_yaml.format(
args=args,
container=container,
command=args.command,
chips_per_vm=system.chips_per_vm,
gpu_scheduler=gpu_scheduler,
gpu_volume=get_gpu_volume(system),
gpu_rxdm_image=get_gpu_rxdm_image(system),
gpu_rxdm_cmd=get_gpu_rxdm_cmd(system),
gpu_tcp_volume=get_gpu_tcp_volume(system),
failure_policy_rules=failure_policy_rules,
pod_failure_policy=pod_failure_policy,
)
elif args.use_pathways and ensure_pathways_workload_prerequisites(
args, system
):
yml_string = pw_workload_create_yaml.format(
args=args,
system=system,
accelerator_label=create_accelerator_label(
system.accelerator_type, system
),
machine_label=create_machine_label(system.accelerator_type, system),
pathways_rm_args=get_pathways_rm_args(args, system),
pathways_worker_args=get_pathways_worker_args(args),
pathways_proxy_args=get_pathways_proxy_args(args),
user_workload=get_user_workload_for_pathways(args, system),
resource_type=AcceleratorTypeToAcceleratorCharacteristics[
system.accelerator_type
].resource_type,
local_queue_name=LOCAL_QUEUE_NAME,
autoprovisioning_args=autoprovisioning_args,
backoff_limit=system.vms_per_slice * 4,
failure_policy_rules=failure_policy_rules,
pod_failure_policy=pod_failure_policy,
)
else:
container, debugging_dashboard_id = get_user_workload_container(
args, system
)
yml_string = workload_create_yaml.format(
args=args,
system=system,
container=container,
affinity=get_cpu_affinity(system.accelerator_type),
accelerator_label=create_accelerator_label(
system.accelerator_type, system
),
machine_label=create_machine_label(system.accelerator_type, system),
local_queue_name=LOCAL_QUEUE_NAME,
autoprovisioning_args=autoprovisioning_args,
volumes=get_volumes(args, system),
failure_policy_rules=failure_policy_rules,
pod_failure_policy=pod_failure_policy,
)
# if system.device_type in cluster_gcluster.supported_device_types:
# yml_string = a3_gpu_workload_create_yaml.format(
# args=args,
# container=container,
# failure_policy_rules=failure_policy_rules,
# pod_failure_policy=pod_failure_policy,
# )

# if args.device_type == cluster_gcluster.a3mega_device_type:
# sub_networks = [f'{args.cluster}-gpunet-{i}-subnet' for i in range(8)]
# yml_string = tcpxo_decorator.decorate_jobset(yml_string, sub_networks)

# if args.device_type == cluster_gcluster.a3ultra_device_type:
# sub_networks = [f'{args.cluster}-sub-1'] + [
# f'{args.cluster}-rdma-sub-{i}' for i in range(8)
# ]
# yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)
# else:
# yml_string = gpu_workload_create_yaml.format(
# args=args,
# container=container,
# command=args.command,
# chips_per_vm=system.chips_per_vm,
# gpu_scheduler=gpu_scheduler,
# gpu_volume=get_gpu_volume(system),
# gpu_rxdm_image=get_gpu_rxdm_image(system),
# gpu_rxdm_cmd=get_gpu_rxdm_cmd(system),
# gpu_tcp_volume=get_gpu_tcp_volume(system),
# failure_policy_rules=failure_policy_rules,
# pod_failure_policy=pod_failure_policy,
# )
# elif args.use_pathways and ensure_pathways_workload_prerequisites(
# args, system
# ):
yml_string = pw_workload_create_yaml.format(
args=args,
system=system,
accelerator_label=create_accelerator_label(
system.accelerator_type, system
),
machine_label=create_machine_label(system.accelerator_type, system),
pathways_rm_args=get_pathways_rm_args(args, system),
pathways_worker_args=get_pathways_worker_args(args),
pathways_proxy_args=get_pathways_proxy_args(args),
user_workload=get_user_workload_for_pathways(args, system),
resource_type=AcceleratorTypeToAcceleratorCharacteristics[
system.accelerator_type
].resource_type,
local_queue_name=LOCAL_QUEUE_NAME,
autoprovisioning_args=autoprovisioning_args,
backoff_limit=system.vms_per_slice * 4,
failure_policy_rules=failure_policy_rules,
pod_failure_policy=pod_failure_policy,
)
# else:
# container, debugging_dashboard_id = get_user_workload_container(
# args, system
# )
# yml_string = workload_create_yaml.format(
# args=args,
# system=system,
# container=container,
# affinity=get_cpu_affinity(system.accelerator_type),
# accelerator_label=create_accelerator_label(
# system.accelerator_type, system
# ),
# machine_label=create_machine_label(system.accelerator_type, system),
# local_queue_name=LOCAL_QUEUE_NAME,
# autoprovisioning_args=autoprovisioning_args,
# volumes=get_volumes(args, system),
# failure_policy_rules=failure_policy_rules,
# pod_failure_policy=pod_failure_policy,
# )
tmp = write_tmp_file(yml_string)
command = f'kubectl apply -f {str(tmp.file.name)}'
return_code = run_command_with_updates(command, 'Creating Workload', args)
Expand Down
3 changes: 2 additions & 1 deletion src/xpk/core/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"""

import datetime
import os
import subprocess
import sys
import time
Expand Down Expand Up @@ -84,7 +85,7 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
children.append(
# subprocess managed by list pylint: disable=consider-using-with
subprocess.Popen(
command, stdout=output_logs[i], stderr=output_logs[i], shell=True
command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
)
)

Expand Down
10 changes: 10 additions & 0 deletions src/xpk/core/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2756,6 +2756,16 @@ def wait_for_job_completion(args) -> int:
return return_code
full_workload_name = return_value.split(' ')[0]


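# Describe the workload identified by args.workload
# NOTE(review): `kubectl describe` has no `-o`/`--output` flag, so the command
# below will likely fail with an unknown-flag error; if YAML output is wanted,
# `kubectl get workload <name> -o yaml` is the usual form — confirm intent.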
describe_workload = f'kubectl describe workload {args.workload} -o yaml'
return_code, return_value = run_commands(
describe_workload, 'Describe workload', args
)
if return_code != 0:
xpk_print(f'Describe workload name request returned ERROR {return_code}')
return return_code

# Call kubectl wait on the workload using the full workload name
timeout_val = args.timeout if args.timeout is not None else -1
timeout_msg = (
Expand Down

0 comments on commit f61369f

Please sign in to comment.