Skip to content

Commit

Permalink
Merge branch 'ppawl-remove-kubectl-from-makefile' of https://github.c…
Browse files Browse the repository at this point in the history
…om/AI-Hypercomputer/xpk into ppawl-remove-kubectl-from-makefile
  • Loading branch information
pawloch00 committed Jan 15, 2025
2 parents 24051c7 + a58696c commit 49059fa
Show file tree
Hide file tree
Showing 4 changed files with 214 additions and 12 deletions.
21 changes: 15 additions & 6 deletions .github/workflows/build_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ on:
type: choice
options:
- v4-8
- v5litepod-8
push:
branches: ["main"]
pull_request: # By default this runs for types assigned, opened and synchronize.
Expand All @@ -35,7 +36,6 @@ env:
WORKLOAD_NAME: xpktest-build-${{ github.run_attempt }}
PATHWAYS_WORKLOAD_NAME: xpkpw-build-${{ github.run_attempt }}
CLUSTER_ARGUMENTS: "--network=${{secrets.NETWORK_NAME}} --subnetwork=${{secrets.SUBNETWORK_NAME}} --maintenance-window=23:50"
RUN_ID: "pr-${{ github.event.number }}"
PROJECT_ID: ${{secrets.PROJECT_NAME}}
A3_MEGA_TEST_CLUSTER_NAME: "xpk-mega-ctk-int"
A3_ULTRA_TEST_CLUSTER_NAME: "xpk-ultra-ctk-int"
Expand Down Expand Up @@ -105,11 +105,18 @@ jobs:
group: build-test-cluster-group-${{ github.ref }}
cancel-in-progress: false
steps:
- name: Change RUN_ID env var if merge to main
run: echo "RUN_ID=main" >> $GITHUB_ENV
if: ${{ github.ref == 'refs/heads/main' }}
- name: Initialize RUN_ID env var
run: |
if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
RUN_ID="dispatch"
elif [ "${{ github.ref }}" == "refs/heads/main" ]; then
RUN_ID="main"
else
RUN_ID="pr-${{ github.event.number }}"
fi
echo "RUN_ID=$RUN_ID" >> $GITHUB_ENV
- name: Update cluster name with TPU_TYPE and RUN_ID
run: echo "TPU_CLUSTER_NAME=$TPU_CLUSTER_NAME-$TPU_TYPE-$RUN_ID" >> $GITHUB_ENV
run: echo "TPU_CLUSTER_NAME=$TPU_CLUSTER_NAME-$RUN_ID" >> $GITHUB_ENV
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
Expand Down Expand Up @@ -141,6 +148,8 @@ jobs:
run: python xpk.py cluster create-pathways --cluster $TPU_CLUSTER_NAME --private --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_ARGUMENTS}"
- name: Verify the created cluster is private
run: gcloud container clusters describe $TPU_CLUSTER_NAME --region=us-central2 --format="value(privateClusterConfig.enablePrivateNodes)" | grep 'True' || (echo 'The created cluster is not private.' && exit 1)
- name: List out the nodepools on the cluster
run: python xpk.py cluster describe --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep -P '^(?=.*NODEPOOL_NAME)(?=.*SLICE)(?=.*TYPE)(?=.*EXPECTED_HEALTHY_NODES)(?=.*ACTUAL_HEALTHY_NODES)(?=.*TOTAL_NODES)'
- name: Authenticate Docker
run: gcloud auth configure-docker --quiet
- name: Create test script to execute in workloads
Expand Down Expand Up @@ -202,7 +211,7 @@ jobs:
run: python3 xpk.py shell stop
- name: Delete the cluster created
if: always()
run: echo 'y' | python xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
run: python xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --force



Expand Down
193 changes: 187 additions & 6 deletions src/xpk/commands/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@
from ..utils.console import xpk_exit, xpk_print
from . import cluster_gcluster

from tabulate import tabulate


def cluster_create(args) -> None:
"""Function around cluster creation.
Expand Down Expand Up @@ -323,15 +325,18 @@ def cluster_describe(args) -> None:
if set_cluster_command_code != 0:
xpk_exit(set_cluster_command_code)

command = (
f'gcloud container node-pools list --cluster {args.cluster} '
f'--project={args.project} --region={zone_to_region(args.zone)}'
)

return_code = run_command_with_updates(command, 'Cluster nodepool list', args)
return_code, data_table = nodepools_build_table(args)
if return_code != 0:
xpk_exit(return_code)

if len(data_table) > 1:
xpk_print(
'Nodepools info:\n',
tabulate(data_table, headers='firstrow', tablefmt='plain'),
)
else:
xpk_print('No nodepools info found')

return_code_node_output, node_output = run_command_for_value(
r'kubectl get node --no-headers=true'
r" --selector='cloud.google.com/gke-tpu-accelerator' | wc -l",
Expand Down Expand Up @@ -362,6 +367,182 @@ def cluster_describe(args) -> None:
xpk_exit(0)


def nodepools_build_table(args) -> tuple[int, list[list]]:
  """Assemble a per-nodepool summary table for `cluster describe`.

  Each row holds: nodepool name, nodes per slice, instance type, expected
  healthy nodes, actual healthy nodes, and total nodes. Column data comes
  from the sibling get_* kubectl helpers; a helper failure is logged and
  that column skipped (best-effort) rather than aborting the describe.

  Args:
    args: parsed command-line arguments forwarded to the command runners.

  Returns:
    Tuple of (return code, table rows with the header as the first row).
  """
  table = [[
      'NODEPOOL_NAME',
      'SLICE',
      'TYPE',
      'EXPECTED_HEALTHY_NODES',
      'ACTUAL_HEALTHY_NODES',
      'TOTAL_NODES',
  ]]

  # Row accumulator keyed by nodepool name; each value starts as [name]
  # and grows one cell per successfully fetched column.
  nodepools_data = {}

  nodepools, return_code = get_node_pools_name(args)
  if return_code != 0:
    xpk_print(f'Get node pools name returned ERROR {return_code}')

  for name in nodepools:
    nodepools_data[name] = [name]

  def _merge_column(lines, name_index, value_index):
    """Append one parsed column value to each known nodepool's row.

    Skips names missing from the initial nodepool listing so a partially
    failed fetch cannot raise KeyError (the original indexed
    nodepools_data unconditionally).
    """
    for line in lines:
      fields = line.split()
      name, value = fields[name_index], fields[value_index]
      if name in nodepools_data:
        nodepools_data[name].append(value)

  slices, return_code = get_slice_node_pool_size(args)
  if return_code != 0:
    xpk_print(f'Get slice node pool size returned ERROR {return_code}')
  _merge_column(slices, name_index=1, value_index=0)

  type_nodepool, return_code = get_node_pool_instance_type(args)
  if return_code != 0:
    xpk_print(f'Get node pool instance type returned ERROR {return_code}')
  _merge_column(type_nodepool, name_index=0, value_index=1)

  expected_healthy_nodes, return_code = get_expected_healthy_nodes(args)
  if return_code != 0:
    xpk_print(f'Get expected healthy nodes returned ERROR {return_code}')
  _merge_column(expected_healthy_nodes, name_index=1, value_index=0)

  actual_healthy_nodes, return_code = get_actual_healthy_nodes(args)
  if return_code != 0:
    xpk_print(f'Get actual healthy nodes returned ERROR {return_code}')
  _merge_column(actual_healthy_nodes, name_index=1, value_index=0)

  total_nodes, return_code = get_total_nodes_per_node_pool(args)
  if return_code != 0:
    xpk_print(f'Get total nodes per node pool returned ERROR {return_code}')
  _merge_column(total_nodes, name_index=1, value_index=0)

  table.extend(nodepools_data.values())

  return 0, table


def get_node_pools_name(args) -> tuple[list[str], int]:
  """List the distinct nodepool names labelled on the cluster's nodes.

  Args:
    args: parsed command-line arguments forwarded to the command runner.

  Returns:
    Tuple of (nodepool names, return code); names are empty on failure.
  """
  cmd = (
      'kubectl get node --no-headers=true -o'
      " custom-columns='NODEPOOL:.metadata.labels.cloud\\.google\\.com/gke-nodepool'"
      " | grep -v 'none' | sort | uniq"
  )
  code, output = run_command_for_value(cmd, 'Nodepool list', args)
  if code != 0:
    return [], code
  return output.splitlines(), 0


def get_slice_node_pool_size(args) -> tuple[list[str], int]:
  """Count nodes per nodepool via `uniq -c` over the nodepool label.

  Args:
    args: parsed command-line arguments forwarded to the command runner.

  Returns:
    Tuple of ("count nodepool" lines, return code); lines empty on failure.
  """
  cmd = (
      'kubectl get node --no-headers=true -o'
      " custom-columns=':metadata.labels.cloud\\.google\\.com/gke-nodepool'"
      " | grep -v 'none'"
      ' | sort'
      ' | uniq -c'
  )
  code, output = run_command_for_value(
      cmd, 'Count nodes per nodepool slice', args
  )
  return (output.splitlines(), 0) if code == 0 else ([], code)


def get_node_pool_instance_type(args) -> tuple[list[str], int]:
  """Map each nodepool to its node instance type.

  Args:
    args: parsed command-line arguments forwarded to the command runner.

  Returns:
    Tuple of ("nodepool type" lines, return code); lines empty on failure.
  """
  cmd = (
      'kubectl get node --no-headers=true -o'
      " custom-columns='NODEPOOL:.metadata.labels.cloud\\.google\\.com/gke-nodepool,"
      " TYPE:.metadata.labels.node\\.kubernetes\\.io/instance-type' | grep -v"
      " 'none' | sort | uniq"
  )
  code, output = run_command_for_value(
      cmd, 'Instance type of nodepools', args
  )
  if code != 0:
    return [], code
  return output.splitlines(), 0


def get_expected_healthy_nodes(args) -> tuple[list[str], int]:
  """Count the nodes each nodepool is expected to have healthy.

  NOTE(review): the shell pipeline is identical to
  get_slice_node_pool_size's — every registered node counts as expected,
  regardless of Ready status.

  Args:
    args: parsed command-line arguments forwarded to the command runner.

  Returns:
    Tuple of ("count nodepool" lines, return code); lines empty on failure.
  """
  cmd = (
      'kubectl get node --no-headers=true -o'
      " custom-columns=':metadata.labels.cloud\\.google\\.com/gke-nodepool'"
      " | grep -v 'none'"
      ' | sort'
      ' | uniq -c'
  )
  code, output = run_command_for_value(
      cmd,
      'Count expected healthy nodes per nodepool',
      args,
  )
  return (output.splitlines(), 0) if code == 0 else ([], code)


def get_actual_healthy_nodes(args) -> tuple[list[str], int]:
  """Count Ready nodes per nodepool (Ready condition status == True).

  Args:
    args: parsed command-line arguments forwarded to the command runner.

  Returns:
    Tuple of ("count nodepool" lines, return code); lines empty on failure.
  """
  cmd = (
      'kubectl get node --no-headers=true -o'
      " custom-columns='NODE_NAME:metadata.name,"
      ' READY_STATUS:.status.conditions[?(@.type=="Ready")].status,'
      " NODEPOOL:metadata.labels.cloud\\.google\\.com/gke-nodepool' "
      ' | grep -w True'
      " | grep -v 'none'"
      " | awk {'print $3'}"
      ' | sort'
      ' | uniq -c'
  )
  code, output = run_command_for_value(
      cmd, 'Count actual healthy nodes per nodepool', args
  )
  if code != 0:
    return [], code
  return output.splitlines(), 0


def get_total_nodes_per_node_pool(args) -> tuple[list[str], int]:
  """Count all registered nodes per nodepool, regardless of Ready status.

  Args:
    args: parsed command-line arguments forwarded to the command runner.

  Returns:
    Tuple of ("count nodepool" lines, return code); lines empty on failure.
  """
  cmd = (
      'kubectl get node --no-headers=true -o'
      " custom-columns='NODE_NAME:metadata.name,"
      ' READY_STATUS:.status.conditions[?(@.type=="Ready")].status,'
      " NODEPOOL:metadata.labels.cloud\\.google\\.com/gke-nodepool'"
      " | grep -v 'none'"
      " | awk {'print $3'}"
      ' | sort'
      ' | uniq -c'
  )
  code, output = run_command_for_value(
      cmd, 'Count total nodes per nodepool', args
  )
  return (output.splitlines(), 0) if code == 0 else ([], code)


def cluster_list(args) -> None:
"""Function around cluster list.
Expand Down
3 changes: 3 additions & 0 deletions src/xpk/core/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2311,6 +2311,9 @@ def get_main_container_resources(
if system.accelerator_type == AcceleratorType['GPU']:
return gpu_resources_yaml.format(system=system)

if system.accelerator_type == AcceleratorType['CPU']:
return ''

return f'{resource_type}: {system.chips_per_vm}'


Expand Down
9 changes: 9 additions & 0 deletions src/xpk/core/system_characteristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1105,6 +1105,15 @@ def get_system_characteristics_by_device_type(
'v5p-17920',
),
# v5litepod
'v5litepod-8': SystemCharacteristics(
'2x4',
2,
'tpu-v5-lite-podslice',
'ct5lp-hightpu-4t',
8,
AcceleratorType['TPU'],
'v5litepod-8',
),
'v5litepod-16': SystemCharacteristics(
'4x4',
4,
Expand Down

0 comments on commit 49059fa

Please sign in to comment.