diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
index 9111f2f1982..86cb786647f 100644
--- a/.github/workflows/e2e-test-tune-api.yaml
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -22,10 +22,10 @@ jobs:
         with:
           kubernetes-version: ${{ matrix.kubernetes-version }}
 
-      - name: Install Training Operator SDK
+      - name: Install Katib SDK with extra requires
        shell: bash
         run: |
-          pip install "kubeflow-training[huggingface]==1.8.1"
+          pip install --prefer-binary -e 'sdk/python/v1beta1[huggingface]'
 
       - name: Check Disk Space Before Test
         run: |
@@ -99,5 +99,5 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        # Kubernetes versions to test with
-        kubernetes-version: ["v1.29.2"]
+        # Detail: https://hub.docker.com/r/kindest/node
+        kubernetes-version: ["v1.29.2", "v1.30.7", "v1.31.3"]
diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
index cb95b20f31b..e5d19da8256 100644
--- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
+++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
@@ -415,7 +415,9 @@ class name in this argument.
         experiment.spec.max_failed_trial_count = max_failed_trial_count
 
         # If users choose to use a custom objective function.
-        if objective is not None:
+        if objective is not None or parameters is not None:
+            if not objective or not parameters:
+                raise ValueError("One of the required parameters is None")
             # Add metrics collector to the Katib Experiment.
             # Up to now, we only support parameter `kind`, of which default value
             # is `StdOut`, to specify the kind of metrics collector.
@@ -518,6 +520,7 @@ class name in this argument.
                 from kubeflow.storage_initializer.hugging_face import (
                     HuggingFaceDatasetParams,
                     HuggingFaceModelParams,
+                    HuggingFaceTrainerParams,
                 )
                 from kubeflow.storage_initializer.s3 import S3DatasetParams
                 from kubeflow.training import models as training_models
@@ -596,6 +599,11 @@ class name in this argument.
                     "or HuggingFaceDatasetParams."
                 )
 
+            if not isinstance(trainer_parameters, HuggingFaceTrainerParams):
+                raise ValueError(
+                    "Trainer parameters must be an instance of HuggingFaceTrainerParams."
+                )
+
             # Iterate over input parameters and do substitutions.
             experiment_params = []
             trial_params = []
@@ -645,7 +653,11 @@ class name in this argument.
                     f"'{training_args}'",
                 ],
                 volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT],
-                resources=resources_per_trial.resources_per_worker,
+                resources=(
+                    resources_per_trial.resources_per_worker
+                    if resources_per_trial
+                    else None
+                ),
             )
 
             # Create the worker and the master pod.
@@ -656,27 +668,15 @@ class name in this argument.
                 ),
             )
 
-            worker_pod_template_spec = models.V1PodTemplateSpec(
-                metadata=models.V1ObjectMeta(
-                    annotations={"sidecar.istio.io/inject": "false"}
-                ),
-                spec=models.V1PodSpec(
-                    containers=[container_spec],
-                    volumes=[storage_initializer_volume],
-                    termination_grace_period_seconds=60,
-                ),
+            worker_pod_template_spec = training_utils.get_pod_template_spec(
+                containers=[container_spec],
+                volumes=[storage_initializer_volume],
             )
 
-            master_pod_template_spec = models.V1PodTemplateSpec(
-                metadata=models.V1ObjectMeta(
-                    annotations={"sidecar.istio.io/inject": "false"}
-                ),
-                spec=models.V1PodSpec(
-                    init_containers=[init_container_spec],
-                    containers=[container_spec],
-                    volumes=[storage_initializer_volume],
-                    termination_grace_period_seconds=60,
-                ),
+            master_pod_template_spec = training_utils.get_pod_template_spec(
+                containers=[container_spec],
+                init_containers=[init_container_spec],
+                volumes=[storage_initializer_volume],
             )
 
             # Create PyTorchJob.
@@ -691,7 +691,10 @@ class name in this argument.
                 ),
             )
 
-            if resources_per_trial.num_procs_per_worker:
+            if (
+                resources_per_trial is not None
+                and resources_per_trial.num_procs_per_worker
+            ):
                 pytorchjob.spec.nproc_per_node = str(
                     resources_per_trial.num_procs_per_worker
                 )
@@ -703,7 +706,7 @@ class name in this argument.
                     )
                 )
 
-            if resources_per_trial.num_workers > 1:
+            if resources_per_trial is not None and resources_per_trial.num_workers > 1:
                 pytorchjob.spec.pytorch_replica_specs["Worker"] = (
                     training_models.KubeflowOrgV1ReplicaSpec(
                         replicas=resources_per_trial.num_workers - 1,
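
For context, a minimal usage sketch of the tune() code path touched above: an LLM hyperparameter-tuning call with external model and dataset providers and no resources_per_trial, which this change now guards against being None. The keyword names follow the diff (model_provider_parameters, dataset_provider_parameters, trainer_parameters); the concrete model, dataset, and search ranges are illustrative only and not part of this change, and peft/transformers are assumed to be installed via the huggingface extra.

import kubeflow.katib as katib
import transformers
from kubeflow.katib import KatibClient
from kubeflow.storage_initializer.hugging_face import (
    HuggingFaceDatasetParams,
    HuggingFaceModelParams,
    HuggingFaceTrainerParams,
)
from peft import LoraConfig

client = KatibClient(namespace="default")

# Tune LoRA and training hyperparameters of a Hugging Face model; the model,
# dataset, and search ranges below are placeholders for illustration.
client.tune(
    name="llm-hp-tuning-example",
    model_provider_parameters=HuggingFaceModelParams(
        model_uri="hf://google-bert/bert-base-cased",
        transformer_type=transformers.AutoModelForSequenceClassification,
    ),
    dataset_provider_parameters=HuggingFaceDatasetParams(
        repo_id="yelp_review_full",
        split="train[:1000]",
    ),
    trainer_parameters=HuggingFaceTrainerParams(
        training_parameters=transformers.TrainingArguments(
            output_dir="results",
            save_strategy="no",
            num_train_epochs=1,
            # Search spaces are expressed with katib.search helpers.
            learning_rate=katib.search.double(min=1e-05, max=5e-05),
        ),
        lora_config=LoraConfig(
            r=katib.search.int(min=8, max=32),
            lora_alpha=8,
            lora_dropout=0.1,
        ),
    ),
    objective_metric_name="train_loss",
    objective_type="minimize",
    algorithm_name="random",
    max_trial_count=4,
    parallel_trial_count=2,
    # resources_per_trial is intentionally omitted: with this change the
    # client no longer dereferences it when it is None.
)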