Test

AI-Hypercomputer · Jan 15, 2025 · f61369f
1 parent 6b2eb4d
commit f61369f
Show file tree

Hide file tree

Showing 5 changed files with 157 additions and 150 deletions.
diff --git a/.github/workflows/build_tests.yaml b/.github/workflows/build_tests.yaml
@@ -44,63 +44,63 @@ env:
   REGION: us-central2
 
 jobs:
-  run-unit-tests:
-    runs-on: [ubuntu-22.04]
-    concurrency: # We support one build or nightly test to run at a time currently.
-      group: build-test-cluster-group
-      cancel-in-progress: false
-    steps:
-    - uses: actions/checkout@v4
-    - uses: actions/setup-python@v5
-      with:
-        python-version: '3.10'
-    - uses: google-github-actions/setup-gcloud@v2
-      with:
-        version: '>= 363.0.0'
-        install_components: 'beta, gke-gcloud-auth-plugin'
-    - name: Install dependencies
-      run : make install-dev
-    - name: Run unit tests
-      run: make run-unittests
+  # run-unit-tests:
+  #   runs-on: [ubuntu-22.04]
+  #   concurrency: # We support one build or nightly test to run at a time currently.
+  #     group: build-test-cluster-group
+  #     cancel-in-progress: false
+  #   steps:
+  #   - uses: actions/checkout@v4
+  #   - uses: actions/setup-python@v5
+  #     with:
+  #       python-version: '3.10'
+  #   - uses: google-github-actions/setup-gcloud@v2
+  #     with:
+  #       version: '>= 363.0.0'
+  #       install_components: 'beta, gke-gcloud-auth-plugin'
+  #   - name: Install dependencies
+  #     run : make install-dev
+  #   - name: Run unit tests
+  #     run: make run-unittests
 
-  run-integration-tests:
-    runs-on: [ubuntu-22.04]
-    needs: [run-unit-tests]
-    concurrency: # We support one build or nightly test to run at a time currently.
-      group: build-test-cluster-group
-      cancel-in-progress: false
-    steps:
-    - uses: actions/checkout@v4
-    - uses: actions/setup-python@v5
-      with:
-        python-version: '3.10'
-    - uses: 'google-github-actions/auth@v2'
-      with:
-        credentials_json: '${{ secrets.GCP_SA_KEY }}'
-    - uses: google-github-actions/setup-gcloud@v2
-      with:
-        version: '>= 363.0.0'
-        install_components: 'beta,gke-gcloud-auth-plugin, gke-gcloud-auth-plugin'
-    - name: Verify gcp setup
-      run: gcloud info
-    - name: Install dependencies
-      run : make install-dev
-    - name: "Set auth cidr"
-      run: echo "AUTH_CIDR=$(curl api.ipify.org)/32" >> $GITHUB_ENV
-    - name: "Set GCLOUD_CFG_PATH"
-      run: echo "GCLOUD_CFG_PATH=/home/runner/work/xpk/xpk/" >> $GITHUB_ENV
-    - name: "Copy credentials"
-      run: cp $GOOGLE_APPLICATION_CREDENTIALS $GCLOUD_CFG_PATH/application_default_credentials.json
-    - name: "Set DEPLOYMENT_DIR"
-      run: echo "DEPLOYMENT_DIR=$HOME/deployment" >> $GITHUB_ENV
-    - name: Create deployment dir
-      run: mkdir -p $DEPLOYMENT_DIR
-    - name: Run integration tests
-      run: make run-integrationtests
+  # run-integration-tests:
+  #   runs-on: [ubuntu-22.04]
+  #   needs: [run-unit-tests]
+  #   concurrency: # We support one build or nightly test to run at a time currently.
+  #     group: build-test-cluster-group
+  #     cancel-in-progress: false
+  #   steps:
+  #   - uses: actions/checkout@v4
+  #   - uses: actions/setup-python@v5
+  #     with:
+  #       python-version: '3.10'
+  #   - uses: 'google-github-actions/auth@v2'
+  #     with:
+  #       credentials_json: '${{ secrets.GCP_SA_KEY }}'
+  #   - uses: google-github-actions/setup-gcloud@v2
+  #     with:
+  #       version: '>= 363.0.0'
+  #       install_components: 'beta,gke-gcloud-auth-plugin, gke-gcloud-auth-plugin'
+  #   - name: Verify gcp setup
+  #     run: gcloud info
+  #   - name: Install dependencies
+  #     run : make install-dev
+  #   - name: "Set auth cidr"
+  #     run: echo "AUTH_CIDR=$(curl api.ipify.org)/32" >> $GITHUB_ENV
+  #   - name: "Set GCLOUD_CFG_PATH"
+  #     run: echo "GCLOUD_CFG_PATH=/home/runner/work/xpk/xpk/" >> $GITHUB_ENV
+  #   - name: "Copy credentials"
+  #     run: cp $GOOGLE_APPLICATION_CREDENTIALS $GCLOUD_CFG_PATH/application_default_credentials.json
+  #   - name: "Set DEPLOYMENT_DIR"
+  #     run: echo "DEPLOYMENT_DIR=$HOME/deployment" >> $GITHUB_ENV
+  #   - name: Create deployment dir
+  #     run: mkdir -p $DEPLOYMENT_DIR
+  #   - name: Run integration tests
+  #     run: make run-integrationtests
 
   cluster-create-and-delete:
     runs-on: [ubuntu-22.04]
-    needs: [run-integration-tests]
+    #needs: [run-integration-tests]
     concurrency: # We support one nightly test and one build test for each branch to run at a time currently.
       group: build-test-cluster-group-${{ github.ref }}
       cancel-in-progress: false
@@ -154,12 +154,12 @@ jobs:
       run: gcloud auth configure-docker --quiet
     - name: Create test script to execute in workloads
       run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > workload.sh
-    - name: Run a base-docker-image workload
-      run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME  --command "bash workload.sh"  --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b
-    - name: Run xpk inspector with the workload created above
-      run: python3 xpk.py inspector --cluster $TPU_CLUSTER_NAME --zone=us-central2-b  --workload $WORKLOAD_NAME
-    - name: Wait for workload completion and confirm it succeeded
-      run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $WORKLOAD_NAME --timeout 300
+    # - name: Run a base-docker-image workload
+    #   run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME  --command "bash workload.sh"  --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b
+    # - name: Run xpk inspector with the workload created above
+    #   run: python3 xpk.py inspector --cluster $TPU_CLUSTER_NAME --zone=us-central2-b  --workload $WORKLOAD_NAME
+    # - name: Wait for workload completion and confirm it succeeded
+    #   run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $WORKLOAD_NAME --timeout 300
     - name: Run a Pathways workload on Ubuntu base image
       run: python xpk.py workload create-pathways --cluster $TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b --command "echo \"Hello world from a test script! \""
     - name: Wait for Pathways workload completion and confirm it succeeded

diff --git a/.github/workflows/lint_and_format.yml b/.github/workflows/lint_and_format.yml
@@ -16,9 +16,9 @@ name: Lint and Format
 
 on:
   pull_request:
-  push:
-    branches:
-      - main
+  # push:
+  #   branches:
+  #     - main
 
 jobs:
   build-and-test:

diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py
@@ -224,10 +224,6 @@
   failurePolicy:
     {failure_policy_rules}
     maxRestarts: {args.max_restarts}
-  successPolicy:
-    operator: "All"
-    targetReplicatedJobs:
-    - {args.targetReplicatedJob}
   replicatedJobs:
     - name: worker
       replicas: {args.num_slices}
@@ -382,18 +378,18 @@ def workload_create(args) -> None:
         " -c 'import pathwaysutils; import jax; print(jax.devices())'"
     )
 
-  set_cluster_command_code = set_cluster_command(args)
-  if set_cluster_command_code != 0:
-    xpk_exit(set_cluster_command_code)
+  # set_cluster_command_code = set_cluster_command(args)
+  # if set_cluster_command_code != 0:
+  #   xpk_exit(set_cluster_command_code)
 
-  workload_exists = check_if_workload_exists(args)
+  # workload_exists = check_if_workload_exists(args)
 
-  if workload_exists:
-    xpk_print(
-        f'{args.workload} already exists, XPK will not create this workload.'
-        ' Please pick a new workload name'
-    )
-    xpk_exit(1)
+  # if workload_exists:
+  #   xpk_print(
+  #       f'{args.workload} already exists, XPK will not create this workload.'
+  #       ' Please pick a new workload name'
+  #   )
+  #   xpk_exit(1)
 
   xpk_print('Starting workload create', flush=True)
   system, return_code = get_system_characteristics(args)
@@ -483,79 +479,79 @@ def workload_create(args) -> None:
     if return_code != 0:
       xpk_exit(return_code)
 
-    if system.device_type in cluster_gcluster.supported_device_types:
-      yml_string = a3_gpu_workload_create_yaml.format(
-          args=args,
-          container=container,
-          failure_policy_rules=failure_policy_rules,
-          pod_failure_policy=pod_failure_policy,
-      )
-
-      if args.device_type == cluster_gcluster.a3mega_device_type:
-        sub_networks = [f'{args.cluster}-gpunet-{i}-subnet' for i in range(8)]
-        yml_string = tcpxo_decorator.decorate_jobset(yml_string, sub_networks)
-
-      if args.device_type == cluster_gcluster.a3ultra_device_type:
-        sub_networks = [f'{args.cluster}-sub-1'] + [
-            f'{args.cluster}-rdma-sub-{i}' for i in range(8)
-        ]
-        yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)
-    else:
-      yml_string = gpu_workload_create_yaml.format(
-          args=args,
-          container=container,
-          command=args.command,
-          chips_per_vm=system.chips_per_vm,
-          gpu_scheduler=gpu_scheduler,
-          gpu_volume=get_gpu_volume(system),
-          gpu_rxdm_image=get_gpu_rxdm_image(system),
-          gpu_rxdm_cmd=get_gpu_rxdm_cmd(system),
-          gpu_tcp_volume=get_gpu_tcp_volume(system),
-          failure_policy_rules=failure_policy_rules,
-          pod_failure_policy=pod_failure_policy,
-      )
-  elif args.use_pathways and ensure_pathways_workload_prerequisites(
-      args, system
-  ):
-    yml_string = pw_workload_create_yaml.format(
-        args=args,
-        system=system,
-        accelerator_label=create_accelerator_label(
-            system.accelerator_type, system
-        ),
-        machine_label=create_machine_label(system.accelerator_type, system),
-        pathways_rm_args=get_pathways_rm_args(args, system),
-        pathways_worker_args=get_pathways_worker_args(args),
-        pathways_proxy_args=get_pathways_proxy_args(args),
-        user_workload=get_user_workload_for_pathways(args, system),
-        resource_type=AcceleratorTypeToAcceleratorCharacteristics[
-            system.accelerator_type
-        ].resource_type,
-        local_queue_name=LOCAL_QUEUE_NAME,
-        autoprovisioning_args=autoprovisioning_args,
-        backoff_limit=system.vms_per_slice * 4,
-        failure_policy_rules=failure_policy_rules,
-        pod_failure_policy=pod_failure_policy,
-    )
-  else:
-    container, debugging_dashboard_id = get_user_workload_container(
-        args, system
-    )
-    yml_string = workload_create_yaml.format(
-        args=args,
-        system=system,
-        container=container,
-        affinity=get_cpu_affinity(system.accelerator_type),
-        accelerator_label=create_accelerator_label(
-            system.accelerator_type, system
-        ),
-        machine_label=create_machine_label(system.accelerator_type, system),
-        local_queue_name=LOCAL_QUEUE_NAME,
-        autoprovisioning_args=autoprovisioning_args,
-        volumes=get_volumes(args, system),
-        failure_policy_rules=failure_policy_rules,
-        pod_failure_policy=pod_failure_policy,
-    )
+    # if system.device_type in cluster_gcluster.supported_device_types:
+    #   yml_string = a3_gpu_workload_create_yaml.format(
+    #       args=args,
+    #       container=container,
+    #       failure_policy_rules=failure_policy_rules,
+    #       pod_failure_policy=pod_failure_policy,
+    #   )
+
+    #   if args.device_type == cluster_gcluster.a3mega_device_type:
+    #     sub_networks = [f'{args.cluster}-gpunet-{i}-subnet' for i in range(8)]
+    #     yml_string = tcpxo_decorator.decorate_jobset(yml_string, sub_networks)
+
+    #   if args.device_type == cluster_gcluster.a3ultra_device_type:
+    #     sub_networks = [f'{args.cluster}-sub-1'] + [
+    #         f'{args.cluster}-rdma-sub-{i}' for i in range(8)
+    #     ]
+    #     yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)
+    # else:
+    #   yml_string = gpu_workload_create_yaml.format(
+    #       args=args,
+    #       container=container,
+    #       command=args.command,
+    #       chips_per_vm=system.chips_per_vm,
+    #       gpu_scheduler=gpu_scheduler,
+    #       gpu_volume=get_gpu_volume(system),
+    #       gpu_rxdm_image=get_gpu_rxdm_image(system),
+    #       gpu_rxdm_cmd=get_gpu_rxdm_cmd(system),
+    #       gpu_tcp_volume=get_gpu_tcp_volume(system),
+    #       failure_policy_rules=failure_policy_rules,
+    #       pod_failure_policy=pod_failure_policy,
+    #   )
+  # elif args.use_pathways and ensure_pathways_workload_prerequisites(
+  #     args, system
+  # ):
+  yml_string = pw_workload_create_yaml.format(
+      args=args,
+      system=system,
+      accelerator_label=create_accelerator_label(
+          system.accelerator_type, system
+      ),
+      machine_label=create_machine_label(system.accelerator_type, system),
+      pathways_rm_args=get_pathways_rm_args(args, system),
+      pathways_worker_args=get_pathways_worker_args(args),
+      pathways_proxy_args=get_pathways_proxy_args(args),
+      user_workload=get_user_workload_for_pathways(args, system),
+      resource_type=AcceleratorTypeToAcceleratorCharacteristics[
+          system.accelerator_type
+      ].resource_type,
+      local_queue_name=LOCAL_QUEUE_NAME,
+      autoprovisioning_args=autoprovisioning_args,
+      backoff_limit=system.vms_per_slice * 4,
+      failure_policy_rules=failure_policy_rules,
+      pod_failure_policy=pod_failure_policy,
+  )
+  # else:
+  #   container, debugging_dashboard_id = get_user_workload_container(
+  #       args, system
+  #   )
+  #   yml_string = workload_create_yaml.format(
+  #       args=args,
+  #       system=system,
+  #       container=container,
+  #       affinity=get_cpu_affinity(system.accelerator_type),
+  #       accelerator_label=create_accelerator_label(
+  #           system.accelerator_type, system
+  #       ),
+  #       machine_label=create_machine_label(system.accelerator_type, system),
+  #       local_queue_name=LOCAL_QUEUE_NAME,
+  #       autoprovisioning_args=autoprovisioning_args,
+  #       volumes=get_volumes(args, system),
+  #       failure_policy_rules=failure_policy_rules,
+  #       pod_failure_policy=pod_failure_policy,
+  #   )
   tmp = write_tmp_file(yml_string)
   command = f'kubectl apply -f {str(tmp.file.name)}'
   return_code = run_command_with_updates(command, 'Creating Workload', args)

diff --git a/src/xpk/core/commands.py b/src/xpk/core/commands.py
@@ -15,6 +15,7 @@
 """
 
 import datetime
+import os
 import subprocess
 import sys
 import time
@@ -84,7 +85,7 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
     children.append(
         # subprocess managed by list pylint: disable=consider-using-with
         subprocess.Popen(
-            command, stdout=output_logs[i], stderr=output_logs[i], shell=True
+            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
         )
     )
 

diff --git a/src/xpk/core/core.py b/src/xpk/core/core.py
@@ -2756,6 +2756,16 @@ def wait_for_job_completion(args) -> int:
     return return_code
   full_workload_name = return_value.split(' ')[0]
 
+
+  # Describe workload name
+  describe_workload = f'kubectl describe workload {args.workload} -o yaml'
+  return_code, return_value = run_commands(
+      describe_workload, 'Describe workload', args
+  )
+  if return_code != 0:
+    xpk_print(f'Describe workload name request returned ERROR {return_code}')
+    return return_code
+
   # Call kubectl wait on the workload using the full workload name
   timeout_val = args.timeout if args.timeout is not None else -1
   timeout_msg = (