Skip to content

Commit

Permalink
Workloads implemented for A3-Mega and A3-Ultra machines (#306)
Browse files Browse the repository at this point in the history
* A3Mega workload implemented
* A3Ultra workload implemented
  • Loading branch information
sharabiani authored Dec 28, 2024
1 parent ff68169 commit 0e41cd6
Show file tree
Hide file tree
Showing 20 changed files with 742 additions and 151 deletions.
141 changes: 141 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# editor and IDE paraphernalia
.idea/

*__pycache__*
tmp/
.pytype
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so
bin/
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# DS_Store files
**/.DS_Store

# XPK/Cluster Toolkit working directory
xpkclusters/*
40 changes: 38 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,10 @@ xpk supports the following TPU types:
* Trillium (v6e)

and the following GPU types:
* a100
* a3 (h100)
* A100
* A3-Highgpu (h100)
* A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-and-a3-mega-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-and-a3-mega-clusters-gpu-machines)
* A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-and-a3-mega-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-and-a3-mega-clusters-gpu-machines)

and the following CPU types:
* n2-standard-32
Expand Down Expand Up @@ -397,6 +399,26 @@ will fail the cluster creation process because Vertex AI Tensorboard is not supp
--tpu-type=v5litepod-16
```

## Provisioning A3-Ultra and A3-Mega clusters (GPU machines)
To create a cluster with A3 machines, run the below command. To create workloads on these clusters see [here](#workloads-for-a3-ultra-and-a3-mega-clusters-gpu-machines).
* For A3-Ultra: --device-type=h200-141gb-8
* For A3-Mega: --device-type=h100-mega-80gb-8

```shell
python3 xpk.py cluster create \
--cluster CLUSTER_NAME --device-type=h200-141gb-8 \
--zone=$COMPUTE_ZONE --project=$PROJECT_ID \
--num-nodes=4 --reservation=$RESERVATION_ID
```
Currently, the below flags/arguments are supported for A3-Mega and A3-Ultra machines:
* --num-nodes
* --default-pool-cpu-machine-type
* --default-pool-cpu-num-nodes
* --reservation
* --spot
* --on-demand (only A3-Mega)


## Workload Create
* Workload Create (submit training job):

Expand Down Expand Up @@ -463,6 +485,20 @@ increase this to a large number, say 50. Real jobs can be interrupted due to
hardware failures and software updates. We assume your job has implemented
checkpointing so the job restarts near where it was interrupted.
### Workloads for A3-Ultra and A3-Mega clusters (GPU machines)
To submit jobs on a cluster with A3 machines, run the below command. To create a cluster with A3 machines see [here](#provisioning-a3-ultra-and-a3-mega-clusters-gpu-machines).
* For A3-Ultra: --device-type=h200-141gb-8
* For A3-Mega: --device-type=h100-mega-80gb-8
```shell
python3 xpk.py workload create \
--workload=$WORKLOAD_NAME --command="echo goodbye" \
--cluster=$CLUSTER_NAME --device-type=h200-141gb-8 \
--zone=$COMPUTE_ZONE --project=$PROJECT_ID \
--num-nodes=$WORKLOAD_NUM_NODES
```
> The docker image flags/arguments introduced in [workloads section](#workload-create) can be used with A3 machines as well.
### Workload Priority and Preemption
* Set the priority level of your workload with `--priority=LEVEL`
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ dependencies = [
"cloud-accelerator-diagnostics",
"tabulate",
"ruamel.yaml",
"pyyaml",
"docker"
]

Expand Down Expand Up @@ -62,7 +63,7 @@ dev = [
version = {attr = "xpk.core.core.__version__"}

[tool.setuptools]
packages = ["xpk", "xpk.parser", "xpk.core", "xpk.commands", "xpk.utils", "xpk.core.blueprint"]
packages = ["xpk", "xpk.parser", "xpk.core", "xpk.commands", "xpk.utils", "xpk.core.blueprint", "xpk.core.workload_decorators"]
package-dir = {"" = "src"}

[tool.pyink]
Expand Down
11 changes: 10 additions & 1 deletion src/xpk/blueprints/a3mega/config-map.yaml.tftpl
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
kind: ConfigMap
apiVersion: v1
metadata:
name: ${name}
name: ${resource_config_name}
data:
h100-mega-80gb-8: "${num_nodes}"
---
kind: ConfigMap
apiVersion: v1
metadata:
name: ${cluster_config_name}
data:
capacity_type: "${capacity_type}"
reservation_id: "${reservation}"
provisioner: gcluster
9 changes: 5 additions & 4 deletions src/xpk/blueprints/a3mega/kueue-xpk-configuration.yaml.tftpl
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,18 @@ kind: ClusterQueue
metadata:
name: cluster-queue
spec:
preemption:
reclaimWithinCohort: Never # Don't preempt other queues in the cohort.
withinClusterQueue: LowerPriority
namespaceSelector: {} # match all.
resourceGroups:
- coveredResources: ["nvidia.com/gpu"]
- coveredResources: ["nvidia.com/gpu", "cpu", "memory"]
flavors:
- name: 1xh100-mega-80gb-8
resources:
- name: "nvidia.com/gpu"
nominalQuota: ${num_chips}
- name: "cpu"
nominalQuota: 10000
- name: "memory"
nominalQuota: 10000Gi
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
Expand Down
10 changes: 9 additions & 1 deletion src/xpk/blueprints/a3ultra/config-map.yaml.tftpl
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,12 @@ metadata:
name: ${resource_config_name}
data:
h200-141gb-8: "${num_nodes}"
nvidia-h200-141gb: "${num_nodes}"
---
kind: ConfigMap
apiVersion: v1
metadata:
name: ${cluster_config_name}
data:
capacity_type: "${capacity_type}"
reservation_id: "${reservation}"
provisioner: gcluster
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,11 @@ kind: ClusterQueue
metadata:
name: cluster-queue
spec:
preemption:
reclaimWithinCohort: Never # Don't preempt other queues in the cohort.
withinClusterQueue: LowerPriority
namespaceSelector: {} # match all.
resourceGroups:
- coveredResources: ["nvidia.com/gpu", "cpu", "memory"]
flavors:
- name: 1xh200-141gb-8g
- name: 1xh200-141gb-8
resources:
- name: "nvidia.com/gpu"
nominalQuota: ${num_chips}
Expand Down
32 changes: 9 additions & 23 deletions src/xpk/commands/cluster_gcluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from ..core.blueprint.blueprint_generator import BlueprintGenerator, BlueprintGeneratorOutput, supported_device_types, a3mega_device_type, a3ultra_device_type
from ..core.docker_manager import DockerManager
from ..core.gcluster_manager import GclusterManager
from ..core.core import zone_to_region
from ..core.core import zone_to_region, get_capacity_type
from ..utils.console import xpk_exit, xpk_print
from ..utils.network import all_IPs_cidr
from ..utils.file import ensure_directory_exists
Expand Down Expand Up @@ -142,7 +142,11 @@ def prepare_blueprint_generator() -> BlueprintGenerator:
def generate_blueprint(
blueprint_name, args, prefix=None
) -> BlueprintGeneratorOutput:
validate_consumption_args(args)
capacity_type, return_code = get_capacity_type(args)
if return_code != 0:
xpk_print('Capacity type is invalid.')
xpk_exit(return_code)

bpg = prepare_blueprint_generator()

if args.device_type in supported_device_types:
Expand All @@ -157,9 +161,8 @@ def generate_blueprint(
zone=args.zone,
auth_cidr=all_IPs_cidr,
num_nodes=num_nodes,
autoscaling_total_min_nodes=num_nodes,
reservation=args.reservation if args.reservation else None,
spot=args.spot if args.spot else False,
capacity_type=capacity_type,
system_node_pool_machine_type=args.default_pool_cpu_machine_type,
system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
)
Expand All @@ -173,27 +176,10 @@ def generate_blueprint(
project_id=args.project,
zone=args.zone,
auth_cidr=all_IPs_cidr,
static_node_count=num_nodes,
num_nodes=num_nodes,
reservation=args.reservation if args.reservation else None,
spot=args.spot if args.spot else False,
capacity_type=capacity_type,
system_node_pool_machine_type=args.default_pool_cpu_machine_type,
system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
)
return None


def validate_consumption_args(args):
args_set = []
if not args.reservation is None:
args_set.append('--reservation')
if not args.spot is None and args.spot:
args_set.append('--spot')
if not args.on_demand is None and args.on_demand:
args_set.append('--on-demand')

if len(args_set) > 1:
xpk_print(
f"Error: only one of {' or '.join(args_set)} can be set at the same"
' time.'
)
xpk_exit(1)
Loading

0 comments on commit 0e41cd6

Please sign in to comment.