diff --git a/solutions/kueue-admission-check/README.md b/solutions/kueue-admission-check/README.md index beacb45dd..9c8766f6c 100644 --- a/solutions/kueue-admission-check/README.md +++ b/solutions/kueue-admission-check/README.md @@ -34,8 +34,6 @@ REF: [Setup a MultiKueue environment](https://kueue.sigs.k8s.io/docs/tasks/manag 2. [Kueue](https://kueue.sigs.k8s.io/docs/installation/) deployed across all clusters. 3. [Managed-serviceaccount](https://github.com/open-cluster-management-io/managed-serviceaccount), [cluster-permission](https://github.com/open-cluster-management-io/cluster-permission) and [resource-usage-collect-addon](https://github.com/open-cluster-management-io/addon-contrib/tree/main/resource-usage-collect-addon) installed on managed clusters. - - - You can set up these above by running the command: ```bash ./setup-env.sh @@ -624,18 +622,4 @@ spec: - Finally, it updates the AdmissionCheck condition to true, indicating successful generation of the `MultiKueueConfig` and `MultiKueueCluster`, readying the [MultiKueue](https://kueue.sigs.k8s.io/docs/concepts/multikueue/) environment for job scheduling. ## TODO -- In the future, the `AdmissionCheckcontroller` may be added to `featureGates` as a user-enabled feature or possibly developed into an individual component running as a pod on the `hub`. -- Users may also need to enable the `ClusterProfile` feature in the `featureGates` to utilize the OCM Admission Check. This can be done by configuring the `ClusterManager` in `hub`. -```yaml -apiVersion: operator.open-cluster-management.io/v1 -kind: ClusterManager -metadata: - name: cluster-manager -spec: - registrationConfiguration: - featureGates: - - feature: ClusterProfile - mode: Enable -... -``` - +- In the future, the `AdmissionCheckcontroller` may be added to `featureGates` as a user-enabled feature or possibly developed into an individual component running as a pod on the `hub`. 
\ No newline at end of file diff --git a/solutions/kueue-admission-check/env/clusterpermission.yaml b/solutions/kueue-admission-check/env/clusterpermission.yaml new file mode 100644 index 000000000..86602c9fb --- /dev/null +++ b/solutions/kueue-admission-check/env/clusterpermission.yaml @@ -0,0 +1,144 @@ +# the permission is copied from https://kueue.sigs.k8s.io/docs/tasks/manage/setup_multikueue/ +apiVersion: rbac.open-cluster-management.io/v1alpha1 +kind: ClusterPermission +metadata: + name: kueue-admin-CLUSTER_NAME + namespace: CLUSTER_NAME +spec: + clusterRole: + rules: + - apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - batch + resources: + - jobs/status + verbs: + - get + - apiGroups: + - jobset.x-k8s.io + resources: + - jobsets + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - jobset.x-k8s.io + resources: + - jobsets/status + verbs: + - get + - apiGroups: + - kueue.x-k8s.io + resources: + - workloads + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - kueue.x-k8s.io + resources: + - workloads/status + verbs: + - get + - patch + - update + - apiGroups: + - kubeflow.org + resources: + - tfjobs + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - kubeflow.org + resources: + - tfjobs/status + verbs: + - get + - apiGroups: + - kubeflow.org + resources: + - paddlejobs + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - kubeflow.org + resources: + - paddlejobs/status + verbs: + - get + - apiGroups: + - kubeflow.org + resources: + - pytorchjobs + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - kubeflow.org + resources: + - pytorchjobs/status + verbs: + - get + - apiGroups: + - kubeflow.org + resources: + - xgboostjobs + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - kubeflow.org + resources: + - xgboostjobs/status + verbs: + - get + - 
apiGroups: + - kubeflow.org + resources: + - mpijobs + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - kubeflow.org + resources: + - mpijobs/status + verbs: + - get + clusterRoleBinding: + subject: + kind: ServiceAccount + name: kueue-admin-CLUSTER_NAME + namespace: open-cluster-management-agent-addon diff --git a/solutions/kueue-admission-check/env/cp-c1.yaml b/solutions/kueue-admission-check/env/cp-c1.yaml deleted file mode 100644 index 6f5eccf48..000000000 --- a/solutions/kueue-admission-check/env/cp-c1.yaml +++ /dev/null @@ -1,63 +0,0 @@ -apiVersion: rbac.open-cluster-management.io/v1alpha1 -kind: ClusterPermission -metadata: - name: kueue-admin-cluster1 - namespace: cluster1 -spec: - clusterRole: - rules: - - apiGroups: - - batch - resources: - - jobs - verbs: - - create - - delete - - get - - list - - watch - - apiGroups: - - batch - resources: - - jobs/status - verbs: - - get - - apiGroups: - - jobset.x-k8s.io - resources: - - jobsets - verbs: - - create - - delete - - get - - list - - watch - - apiGroups: - - jobset.x-k8s.io - resources: - - jobsets/status - verbs: - - get - - apiGroups: - - kueue.x-k8s.io - resources: - - workloads - verbs: - - create - - delete - - get - - list - - watch - - apiGroups: - - kueue.x-k8s.io - resources: - - workloads/status - verbs: - - get - - patch - - update - clusterRoleBinding: - subject: - kind: ServiceAccount - name: kueue-admin-cluster1 - namespace: open-cluster-management-agent-addon diff --git a/solutions/kueue-admission-check/env/cp-c2.yaml b/solutions/kueue-admission-check/env/cp-c2.yaml deleted file mode 100644 index 6199444b5..000000000 --- a/solutions/kueue-admission-check/env/cp-c2.yaml +++ /dev/null @@ -1,63 +0,0 @@ -apiVersion: rbac.open-cluster-management.io/v1alpha1 -kind: ClusterPermission -metadata: - name: kueue-admin-cluster2 - namespace: cluster2 -spec: - clusterRole: - rules: - - apiGroups: - - batch - resources: - - jobs - verbs: - - create - - delete - - get - - list - - 
watch - - apiGroups: - - batch - resources: - - jobs/status - verbs: - - get - - apiGroups: - - jobset.x-k8s.io - resources: - - jobsets - verbs: - - create - - delete - - get - - list - - watch - - apiGroups: - - jobset.x-k8s.io - resources: - - jobsets/status - verbs: - - get - - apiGroups: - - kueue.x-k8s.io - resources: - - workloads - verbs: - - create - - delete - - get - - list - - watch - - apiGroups: - - kueue.x-k8s.io - resources: - - workloads/status - verbs: - - get - - patch - - update - clusterRoleBinding: - subject: - kind: ServiceAccount - name: kueue-admin-cluster2 - namespace: open-cluster-management-agent-addon diff --git a/solutions/kueue-admission-check/env/cp-c3.yaml b/solutions/kueue-admission-check/env/cp-c3.yaml deleted file mode 100644 index 842d9480f..000000000 --- a/solutions/kueue-admission-check/env/cp-c3.yaml +++ /dev/null @@ -1,63 +0,0 @@ -apiVersion: rbac.open-cluster-management.io/v1alpha1 -kind: ClusterPermission -metadata: - name: kueue-admin-cluster3 - namespace: cluster3 -spec: - clusterRole: - rules: - - apiGroups: - - batch - resources: - - jobs - verbs: - - create - - delete - - get - - list - - watch - - apiGroups: - - batch - resources: - - jobs/status - verbs: - - get - - apiGroups: - - jobset.x-k8s.io - resources: - - jobsets - verbs: - - create - - delete - - get - - list - - watch - - apiGroups: - - jobset.x-k8s.io - resources: - - jobsets/status - verbs: - - get - - apiGroups: - - kueue.x-k8s.io - resources: - - workloads - verbs: - - create - - delete - - get - - list - - watch - - apiGroups: - - kueue.x-k8s.io - resources: - - workloads/status - verbs: - - get - - patch - - update - clusterRoleBinding: - subject: - kind: ServiceAccount - name: kueue-admin-cluster3 - namespace: open-cluster-management-agent-addon diff --git a/solutions/kueue-admission-check/env/msa-c1.yaml b/solutions/kueue-admission-check/env/msa-c1.yaml deleted file mode 100644 index b7466e992..000000000 --- 
a/solutions/kueue-admission-check/env/msa-c1.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: authentication.open-cluster-management.io/v1beta1 -kind: ManagedServiceAccount -metadata: - name: kueue-admin-cluster1 - namespace: cluster1 -spec: - rotation: {} diff --git a/solutions/kueue-admission-check/env/msa-c3.yaml b/solutions/kueue-admission-check/env/msa-c3.yaml deleted file mode 100644 index d9f8046e6..000000000 --- a/solutions/kueue-admission-check/env/msa-c3.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: authentication.open-cluster-management.io/v1beta1 -kind: ManagedServiceAccount -metadata: - name: kueue-admin-cluster3 - namespace: cluster3 -spec: - rotation: {} diff --git a/solutions/kueue-admission-check/env/msa-c2.yaml b/solutions/kueue-admission-check/env/msa.yaml similarity index 67% rename from solutions/kueue-admission-check/env/msa-c2.yaml rename to solutions/kueue-admission-check/env/msa.yaml index 91971cdfd..14db974a9 100644 --- a/solutions/kueue-admission-check/env/msa-c2.yaml +++ b/solutions/kueue-admission-check/env/msa.yaml @@ -1,7 +1,7 @@ apiVersion: authentication.open-cluster-management.io/v1beta1 kind: ManagedServiceAccount metadata: - name: kueue-admin-cluster2 - namespace: cluster2 + name: kueue-admin-CLUSTER_NAME + namespace: CLUSTER_NAME spec: rotation: {} diff --git a/solutions/kueue-admission-check/env/patch-clusterrole.json b/solutions/kueue-admission-check/env/patch-clusterrole.json index 0d876009d..f01b1b872 100644 --- a/solutions/kueue-admission-check/env/patch-clusterrole.json +++ b/solutions/kueue-admission-check/env/patch-clusterrole.json @@ -1,22 +1,4 @@ [ - { - "op": "add", - "path": "/rules/-", - "value": { - "apiGroups": ["multicluster.x-k8s.io"], - "resources": ["clusterprofiles"], - "verbs": ["get", "list", "watch", "create", "update", "patch", "delete"] - } - }, - { - "op": "add", - "path": "/rules/-", - "value": { - "apiGroups": ["multicluster.x-k8s.io"], - "resources": ["clusterprofiles/status"], - "verbs": ["update", 
"patch"] - } - }, { "op": "add", "path": "/rules/-", diff --git a/solutions/kueue-admission-check/multikueue-setup-demo1.yaml b/solutions/kueue-admission-check/multikueue-setup-demo1.yaml index 3d4888c03..11a1b4d7b 100644 --- a/solutions/kueue-admission-check/multikueue-setup-demo1.yaml +++ b/solutions/kueue-admission-check/multikueue-setup-demo1.yaml @@ -40,7 +40,7 @@ spec: kind: MultiKueueConfig name: multikueue-config-demo1 --- -apiVersion: kueue.x-k8s.io/v1alpha1 +apiVersion: kueue.x-k8s.io/v1beta1 kind: MultiKueueConfig metadata: name: multikueue-config-demo1 @@ -49,7 +49,7 @@ spec: - multikueue-demo1-cluster1 - multikueue-demo1-cluster2 --- -apiVersion: kueue.x-k8s.io/v1alpha1 +apiVersion: kueue.x-k8s.io/v1beta1 kind: MultiKueueCluster metadata: name: multikueue-demo1-cluster1 @@ -61,7 +61,7 @@ spec: # controller manager runs into, holding the kubeConfig needed to connect to the # worker cluster in the "kubeconfig" key; --- -apiVersion: kueue.x-k8s.io/v1alpha1 +apiVersion: kueue.x-k8s.io/v1beta1 kind: MultiKueueCluster metadata: name: multikueue-demo1-cluster2 diff --git a/solutions/kueue-admission-check/setup-env.sh b/solutions/kueue-admission-check/setup-env.sh index 2ef680a86..574f2511a 100755 --- a/solutions/kueue-admission-check/setup-env.sh +++ b/solutions/kueue-admission-check/setup-env.sh @@ -14,58 +14,69 @@ c1ctx="kind-${c1}" c2ctx="kind-${c2}" c3ctx="kind-${c3}" -kind create cluster --name "${hub}" --image kindest/node:v1.29.0@sha256:eaa1450915475849a73a9227b8f201df25e55e268e5d619312131292e324d570 -kind create cluster --name "${c1}" --image kindest/node:v1.29.0@sha256:eaa1450915475849a73a9227b8f201df25e55e268e5d619312131292e324d570 -kind create cluster --name "${c2}" --image kindest/node:v1.29.0@sha256:eaa1450915475849a73a9227b8f201df25e55e268e5d619312131292e324d570 -kind create cluster --name "${c3}" --image kindest/node:v1.29.0@sha256:eaa1450915475849a73a9227b8f201df25e55e268e5d619312131292e324d570 - -echo "Initialize the ocm hub cluster" - 
-clusteradm init --feature-gates="ManifestWorkReplicaSet=true,ManagedClusterAutoApproval=true" --bundle-version="latest" --wait --context ${hubctx} +spoke_clusters=(${c1} ${c2} ${c3}) +all_clusters=(${hub} ${spoke_clusters[@]}) +spoke_ctx=(${c1ctx} ${c2ctx} ${c3ctx}) +all_ctx=(${hubctx} ${spoke_ctx[@]}) + +kueue_manifest="https://github.com/kubernetes-sigs/kueue/releases/download/v0.9.1/manifests.yaml" +jobset_manifest="https://github.com/kubernetes-sigs/jobset/releases/download/v0.7.1/manifests.yaml" +mpi_operator_manifest="https://github.com/kubeflow/mpi-operator/releases/download/v0.6.0/mpi-operator.yaml" +training_operator_kustomize="github.com/kubeflow/training-operator.git/manifests/overlays/standalone?ref=v1.8.1" + +# ocm setup +echo "Parepare kind clusters" +for cluster in "${all_clusters[@]}"; do + kind create cluster --name "$cluster" --image kindest/node:v1.29.0 +done + +echo "Initialize the ocm hub cluster with ClusterProfile enabled" +clusteradm init --feature-gates="ManifestWorkReplicaSet=true,ManagedClusterAutoApproval=true,ClusterProfile=true" --bundle-version="v0.15.0" --wait --context ${hubctx} joincmd=$(clusteradm get token --context ${hubctx} | grep clusteradm) -echo "Join cluster1 to hub" +echo "Join clusters to hub" $(echo ${joincmd} --force-internal-endpoint-lookup --wait --context ${c1ctx} | sed "s/<cluster_name>/$c1/g") - -echo "Join cluster2 to hub" $(echo ${joincmd} --force-internal-endpoint-lookup --wait --context ${c2ctx} | sed "s/<cluster_name>/$c2/g") - -echo "Join cluster3 to hub" $(echo ${joincmd} --force-internal-endpoint-lookup --wait --context ${c3ctx} | sed "s/<cluster_name>/$c3/g") -echo "Accept join of cluster1 and cluster2" +echo "Accept join of clusters" clusteradm accept --context ${hubctx} --clusters ${c1},${c2},${c3} --wait kubectl get managedclusters --all-namespaces --context ${hubctx} -echo "Install Kueue (this can be replaced with OCM Manifestwork in the future)" -kubectl apply --server-side -f 
https://github.com/kubernetes-sigs/kueue/releases/download/v0.7.1/manifests.yaml --context ${hubctx} -kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.7.1/manifests.yaml --context ${c1ctx} -kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.7.1/manifests.yaml --context ${c2ctx} -kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.7.1/manifests.yaml --context ${c3ctx} +# install kueue, jobset, workflow +for ctx in "${all_ctx[@]}"; do + echo "Install Kueue, Jobset on $ctx" + kubectl apply --server-side -f "$kueue_manifest" --context "$ctx" + kubectl apply --server-side -f "$jobset_manifest" --context "$ctx" +done -echo "Install Jobset for MultiKueue (this can be replaced with OCM Manifestwork in the future)" -kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.5.2/manifests.yaml --context ${hubctx} -kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.5.2/manifests.yaml --context ${c1ctx} -kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.5.2/manifests.yaml --context ${c2ctx} -kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.5.2/manifests.yaml --context ${c3ctx} +for ctx in "${spoke_ctx[@]}"; do + echo "Install Kubeflow MPI Operator, Training Operator on $ctx" + kubectl apply --server-side -f "$mpi_operator_manifest" --context "$ctx" || true + kubectl apply --server-side -k "$training_operator_kustomize" --context "$ctx" || true +done kubectl config use-context ${hubctx} - +# patch some ocm resources and images echo "Patch permission" kubectl patch clusterrole cluster-manager --type='json' -p "$(cat env/patch-clusterrole.json)" echo "Patch image" +# quay.io/haoqing/registration-operator:kueue-v0.9.1 grants more permission for registration and placement. 
+# quay.io/haoqing/registration-operator:kueue-v0.9.1 creates worker’s kubeconfig secret for multikueue. +# quay.io/haoqing/placement:kueue-v0.9.1 implements the admission check controller. +# The source code is in repo https://github.com/haoqing0110/OCM/tree/br_ocm-v0.15.1-kueue-v0.9.1. kubectl patch deployment cluster-manager -n open-cluster-management --type=json -p='[ - {"op": "replace", "path": "/spec/template/spec/containers/0/image", "value": "quay.io/haoqing/registration-operator:latest"}, + {"op": "replace", "path": "/spec/template/spec/containers/0/image", "value": "quay.io/haoqing/registration-operator:kueue-v0.9.1"}, {"op": "replace", "path": "/spec/template/spec/containers/0/imagePullPolicy", "value": "Always"} ]' -kubectl patch clustermanager cluster-manager --type=json -p='[{"op": "replace", "path": "/spec/registrationImagePullSpec", "value": "quay.io/haoqing/registration:latest"}]' -kubectl patch clustermanager cluster-manager --type=json -p='[{"op": "replace", "path": "/spec/placementImagePullSpec", "value": "quay.io/haoqing/placement:latest"}]' - -echo "Install CRDs" -kubectl create -f env/multicluster.x-k8s.io_clusterprofiles.yaml +kubectl patch clustermanager cluster-manager --type=json -p='[ + {"op": "replace", "path": "/spec/registrationImagePullSpec", "value": "quay.io/haoqing/registration:kueue-v0.9.1"}, + {"op": "replace", "path": "/spec/placementImagePullSpec", "value": "quay.io/haoqing/placement:kueue-v0.9.1"} +]' +# install addons echo "Install managed-serviceaccount" git clone git@github.com:open-cluster-management-io/managed-serviceaccount.git || true cd managed-serviceaccount @@ -103,19 +114,15 @@ make deploy cd - rm -rf addon-contrib -echo "Enable MultiKueue on the hub" -kubectl patch deployment kueue-controller-manager -n kueue-system --type='json' -p='[{"op": "replace", "path": "/spec/template/spec/containers/0/args", "value": ["--config=/controller_manager_config.yaml", "--zap-log-level=2", "--feature-gates=MultiKueue=true"]}]' - 
+# prepare credentials for multikueue echo "Setup queue on the spoke" kubectl apply -f env/single-clusterqueue-setup-mwrs.yaml echo "Setup credentials for clusterprofile" -kubectl apply -f env/cp-c1.yaml -kubectl apply -f env/cp-c2.yaml -kubectl apply -f env/cp-c3.yaml -kubectl apply -f env/msa-c1.yaml -kubectl apply -f env/msa-c2.yaml -kubectl apply -f env/msa-c3.yaml +for CLUSTER in "${spoke_clusters[@]}"; do + sed "s/CLUSTER_NAME/$CLUSTER/g" env/clusterpermission.yaml | kubectl apply -f - + sed "s/CLUSTER_NAME/$CLUSTER/g" env/msa.yaml | kubectl apply -f - +done echo "Setup faked GPU on the spoke" kubectl label managedcluster cluster2 accelerator=nvidia-tesla-t4