Update installation of Katib SDK to include extra requirements

Signed-off-by: helenxie-bit <[email protected]>
kubeflow · Jan 23, 2025 · e5bf840 · e5bf840
1 parent b1a2390
commit e5bf840
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 27 deletions.
diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
@@ -22,10 +22,10 @@ jobs:
         with:
           kubernetes-version: ${{ matrix.kubernetes-version }}
 
-      - name: Install Training Operator SDK
+      - name: Install Katib SDK with extra requires
         shell: bash
         run: |
-          pip install "kubeflow-training[huggingface]==1.8.1"
+          pip install --prefer-binary -e 'sdk/python/v1beta1[huggingface]'
       
       - name: Check Disk Space Before Test
         run: |
@@ -99,5 +99,5 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        # Kubernetes versions to test with
-        kubernetes-version: ["v1.29.2"]
+        # Detail: https://hub.docker.com/r/kindest/node
+        kubernetes-version: ["v1.29.2", "v1.30.7", "v1.31.3"]
diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
@@ -415,7 +415,9 @@ class name in this argument.
             experiment.spec.max_failed_trial_count = max_failed_trial_count
 
         # If users choose to use a custom objective function.
-        if objective is not None:
+        if objective is not None or parameters is not None:
+            if not objective or not parameters:
+                raise ValueError("One of the required parameters is None")
             # Add metrics collector to the Katib Experiment.
             # Up to now, we only support parameter `kind`, of which default value
             # is `StdOut`, to specify the kind of metrics collector.
@@ -518,6 +520,7 @@ class name in this argument.
                 from kubeflow.storage_initializer.hugging_face import (
                     HuggingFaceDatasetParams,
                     HuggingFaceModelParams,
+                    HuggingFaceTrainerParams,
                 )
                 from kubeflow.storage_initializer.s3 import S3DatasetParams
                 from kubeflow.training import models as training_models
@@ -596,6 +599,11 @@ class name in this argument.
                     "or HuggingFaceDatasetParams."
                 )
 
+            if not isinstance(trainer_parameters, HuggingFaceTrainerParams):
+                raise ValueError(
+                    "Trainer parameters must be an instance of HuggingFaceTrainerParams."
+                )
+
             # Iterate over input parameters and do substitutions.
             experiment_params = []
             trial_params = []
@@ -645,7 +653,11 @@ class name in this argument.
                     f"'{training_args}'",
                 ],
                 volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT],
-                resources=resources_per_trial.resources_per_worker,
+                resources=(
+                    resources_per_trial.resources_per_worker
+                    if resources_per_trial
+                    else None
+                ),
             )
 
             # Create the worker and the master pod.
@@ -656,27 +668,15 @@ class name in this argument.
                 ),
             )
 
-            worker_pod_template_spec = models.V1PodTemplateSpec(
-                metadata=models.V1ObjectMeta(
-                    annotations={"sidecar.istio.io/inject": "false"}
-                ),
-                spec=models.V1PodSpec(
-                    containers=[container_spec],
-                    volumes=[storage_initializer_volume],
-                    termination_grace_period_seconds=60,
-                ),
+            worker_pod_template_spec = training_utils.get_pod_template_spec(
+                containers=[container_spec],
+                volumes=[storage_initializer_volume],
             )
 
-            master_pod_template_spec = models.V1PodTemplateSpec(
-                metadata=models.V1ObjectMeta(
-                    annotations={"sidecar.istio.io/inject": "false"}
-                ),
-                spec=models.V1PodSpec(
-                    init_containers=[init_container_spec],
-                    containers=[container_spec],
-                    volumes=[storage_initializer_volume],
-                    termination_grace_period_seconds=60,
-                ),
+            master_pod_template_spec = training_utils.get_pod_template_spec(
+                containers=[container_spec],
+                init_containers=[init_container_spec],
+                volumes=[storage_initializer_volume],
             )
 
             # Create PyTorchJob.
@@ -691,7 +691,10 @@ class name in this argument.
                 ),
             )
 
-            if resources_per_trial.num_procs_per_worker:
+            if (
+                resources_per_trial is not None
+                and resources_per_trial.num_procs_per_worker
+            ):
                 pytorchjob.spec.nproc_per_node = str(
                     resources_per_trial.num_procs_per_worker
                 )
@@ -703,7 +706,7 @@ class name in this argument.
                 )
             )
 
-            if resources_per_trial.num_workers > 1:
+            if resources_per_trial is not None and resources_per_trial.num_workers > 1:
                 pytorchjob.spec.pytorch_replica_specs["Worker"] = (
                     training_models.KubeflowOrgV1ReplicaSpec(
                         replicas=resources_per_trial.num_workers - 1,