Skip to content

Commit

Permalink
update installation of Katib SDK with extra requires
Browse files: browse the repository at this point in the history
Signed-off-by: helenxie-bit <[email protected]>
  • Loading branch information
helenxie-bit committed Jan 23, 2025
1 parent b1a2390 commit e5bf840
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 27 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/e2e-test-tune-api.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ jobs:
with:
kubernetes-version: ${{ matrix.kubernetes-version }}

- name: Install Training Operator SDK
- name: Install Katib SDK with extra requires
shell: bash
run: |
pip install "kubeflow-training[huggingface]==1.8.1"
pip install --prefer-binary -e 'sdk/python/v1beta1[huggingface]'
- name: Check Disk Space Before Test
run: |
Expand Down Expand Up @@ -99,5 +99,5 @@ jobs:
strategy:
fail-fast: false
matrix:
# Kubernetes versions to test with
kubernetes-version: ["v1.29.2"]
# Detail: https://hub.docker.com/r/kindest/node
kubernetes-version: ["v1.29.2", "v1.30.7", "v1.31.3"]
49 changes: 26 additions & 23 deletions sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,9 @@ class name in this argument.
experiment.spec.max_failed_trial_count = max_failed_trial_count

# If users choose to use a custom objective function.
if objective is not None:
if objective is not None or parameters is not None:
if not objective or not parameters:
raise ValueError("One of the required parameters is None")
# Add metrics collector to the Katib Experiment.
# Currently only the `kind` parameter is supported; it specifies the kind
# of metrics collector and defaults to `StdOut`.
Expand Down Expand Up @@ -518,6 +520,7 @@ class name in this argument.
from kubeflow.storage_initializer.hugging_face import (
HuggingFaceDatasetParams,
HuggingFaceModelParams,
HuggingFaceTrainerParams,
)
from kubeflow.storage_initializer.s3 import S3DatasetParams
from kubeflow.training import models as training_models
Expand Down Expand Up @@ -596,6 +599,11 @@ class name in this argument.
"or HuggingFaceDatasetParams."
)

if not isinstance(trainer_parameters, HuggingFaceTrainerParams):
raise ValueError(
"Trainer parameters must be an instance of HuggingFaceTrainerParams."
)

# Iterate over input parameters and do substitutions.
experiment_params = []
trial_params = []
Expand Down Expand Up @@ -645,7 +653,11 @@ class name in this argument.
f"'{training_args}'",
],
volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT],
resources=resources_per_trial.resources_per_worker,
resources=(
resources_per_trial.resources_per_worker
if resources_per_trial
else None
),
)

# Create the worker and the master pod.
Expand All @@ -656,27 +668,15 @@ class name in this argument.
),
)

worker_pod_template_spec = models.V1PodTemplateSpec(
metadata=models.V1ObjectMeta(
annotations={"sidecar.istio.io/inject": "false"}
),
spec=models.V1PodSpec(
containers=[container_spec],
volumes=[storage_initializer_volume],
termination_grace_period_seconds=60,
),
worker_pod_template_spec = training_utils.get_pod_template_spec(
containers=[container_spec],
volumes=[storage_initializer_volume],
)

master_pod_template_spec = models.V1PodTemplateSpec(
metadata=models.V1ObjectMeta(
annotations={"sidecar.istio.io/inject": "false"}
),
spec=models.V1PodSpec(
init_containers=[init_container_spec],
containers=[container_spec],
volumes=[storage_initializer_volume],
termination_grace_period_seconds=60,
),
master_pod_template_spec = training_utils.get_pod_template_spec(
containers=[container_spec],
init_containers=[init_container_spec],
volumes=[storage_initializer_volume],
)

# Create PyTorchJob.
Expand All @@ -691,7 +691,10 @@ class name in this argument.
),
)

if resources_per_trial.num_procs_per_worker:
if (
resources_per_trial is not None
and resources_per_trial.num_procs_per_worker
):
pytorchjob.spec.nproc_per_node = str(
resources_per_trial.num_procs_per_worker
)
Expand All @@ -703,7 +706,7 @@ class name in this argument.
)
)

if resources_per_trial.num_workers > 1:
if resources_per_trial is not None and resources_per_trial.num_workers > 1:
pytorchjob.spec.pytorch_replica_specs["Worker"] = (
training_models.KubeflowOrgV1ReplicaSpec(
replicas=resources_per_trial.num_workers - 1,
Expand Down

0 comments on commit e5bf840

Please sign in to comment.