From 7d2b66b6f28e9ea415d5778135edfca277cc5d17 Mon Sep 17 00:00:00 2001 From: vsoch Date: Sun, 1 Sep 2024 12:06:04 -0600 Subject: [PATCH 1/5] version: update builds and ci to go 1.22 Problem: the upstream scheduler-plugins is now using 1.22.0. Solution: we should do the same. Signed-off-by: vsoch --- .github/workflows/build-deploy.yaml | 6 +++--- .github/workflows/e2e-test.yaml | 6 +++--- .github/workflows/test.yaml | 2 +- Makefile | 2 +- .../scheduling/v1alpha1/podgroup_webhook.go | 20 +++++++++---------- src/build/scheduler/Dockerfile | 2 +- src/fluence/go.mod | 2 +- 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/build-deploy.yaml b/.github/workflows/build-deploy.yaml index b5b9db7..e2f7abd 100644 --- a/.github/workflows/build-deploy.yaml +++ b/.github/workflows/build-deploy.yaml @@ -20,7 +20,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v4 with: - go-version: ^1.21 + go-version: ^1.22 - name: Build Containers run: | @@ -57,7 +57,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v4 with: - go-version: ^1.21 + go-version: ^1.22 - name: Build Containers run: | @@ -94,7 +94,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v4 with: - go-version: ^1.21 + go-version: ^1.22 - name: Build Container run: | diff --git a/.github/workflows/e2e-test.yaml b/.github/workflows/e2e-test.yaml index 0832080..741be75 100644 --- a/.github/workflows/e2e-test.yaml +++ b/.github/workflows/e2e-test.yaml @@ -23,7 +23,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v4 with: - go-version: ^1.21 + go-version: ^1.22 - name: Build Containers run: | @@ -56,7 +56,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v4 with: - go-version: ^1.21 + go-version: ^1.22 - name: Build Container run: | @@ -87,7 +87,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v4 with: - go-version: ^1.21 + go-version: ^1.22 - name: Download fluence artifact uses: actions/download-artifact@v4 diff 
--git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 593d1a0..c6c9b22 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -11,7 +11,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-go@v4 with: - go-version: ^1.21 + go-version: ^1.22 - name: Run Tests run: | diff --git a/Makefile b/Makefile index 33a5ab1..e09871c 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ PLATFORMS ?= linux/amd64 BUILDER ?= docker # We match this to the fluence build (see src/build/scheduler/Dockerfile) -GO_VERSION ?= 1.21.9 +GO_VERSION ?= 1.22.0 GO_BASE_IMAGE ?= golang:${GO_VERSION} DISTROLESS_BASE_IMAGE ?= gcr.io/distroless/static:nonroot diff --git a/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go b/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go index 7266d85..64bac65 100644 --- a/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go +++ b/sig-scheduler-plugins/apis/scheduling/v1alpha1/podgroup_webhook.go @@ -38,19 +38,19 @@ var ( // NewMutatingWebhook allows us to keep the sidecarInjector private // If it's public it's exported and kubebuilder tries to add to zz_generated_deepcopy // and you get all kinds of terrible errors about admission.Decoder missing DeepCopyInto -func NewMutatingWebhook(mgr manager.Manager) *fluenceWatcher { - return &fluenceWatcher{decoder: admission.NewDecoder(mgr.GetScheme())} +func NewMutatingWebhook(mgr manager.Manager) fluenceWatcher { + return fluenceWatcher{decoder: admission.NewDecoder(mgr.GetScheme())} } // mutate-v1-fluence type fluenceWatcher struct { - decoder *admission.Decoder + decoder admission.Decoder } // Handle is the main handler for the webhook, which is looking for jobs and pods (in that order) // If a job comes in (with a pod template) first, we add the labels there first (and they will // not be added again). 
-func (hook *fluenceWatcher) Handle(ctx context.Context, req admission.Request) admission.Response { +func (hook fluenceWatcher) Handle(ctx context.Context, req admission.Request) admission.Response { logger.Info("Running webhook handle, determining pod wrapper abstraction...") @@ -145,7 +145,7 @@ func (hook *fluenceWatcher) Handle(ctx context.Context, req admission.Request) a } // Default is the expected entrypoint for a webhook... -func (hook *fluenceWatcher) Default(ctx context.Context, obj runtime.Object) error { +func (hook fluenceWatcher) Default(ctx context.Context, obj runtime.Object) error { switch obj.(type) { case *batchv1.Job: @@ -179,7 +179,7 @@ func (hook *fluenceWatcher) Default(ctx context.Context, obj runtime.Object) err // Note that we need to do similar for Job. // A pod without a job wrapper, and without metadata is a group // of size 1. -func (hook *fluenceWatcher) EnsureGroup(pod *corev1.Pod) error { +func (hook fluenceWatcher) EnsureGroup(pod *corev1.Pod) error { // Add labels if we don't have anything. Everything is a group! if pod.Labels == nil { @@ -221,7 +221,7 @@ func getJobLabel(job *batchv1.Job, labelName, defaultLabel string) string { // EnsureGroupOnJob looks for fluence labels (size and name) on both the job // and the pod template. We ultimately put on the pod, the lowest level unit. // Since we have the size of the job (parallelism) we can use that for the size -func (a *fluenceWatcher) EnsureGroupOnJob(job *batchv1.Job) error { +func (a fluenceWatcher) EnsureGroupOnJob(job *batchv1.Job) error { // Be forgiving - allow the person to specify it on the job directly or on the Podtemplate // We will ultimately put the metadata on the Pod. 
@@ -251,7 +251,7 @@ func (a *fluenceWatcher) EnsureGroupOnJob(job *batchv1.Job) error { } // EnsureGroupStatefulSet creates a PodGroup for a StatefulSet -func (hook *fluenceWatcher) EnsureGroupStatefulSet(set *appsv1.StatefulSet) error { +func (hook fluenceWatcher) EnsureGroupStatefulSet(set *appsv1.StatefulSet) error { // StatefulSet requires on top level explicitly if set.Labels == nil { @@ -279,7 +279,7 @@ func (hook *fluenceWatcher) EnsureGroupStatefulSet(set *appsv1.StatefulSet) erro } // EnsureGroupStatefulSet creates a PodGroup for a StatefulSet -func (a *fluenceWatcher) EnsureGroupReplicaSet(set *appsv1.ReplicaSet) error { +func (a fluenceWatcher) EnsureGroupReplicaSet(set *appsv1.ReplicaSet) error { // StatefulSet requires on top level explicitly if set.Labels == nil { @@ -308,7 +308,7 @@ func (a *fluenceWatcher) EnsureGroupReplicaSet(set *appsv1.ReplicaSet) error { // EnsureGroupDeployment creates a PodGroup for a Deployment // This is redundant, can refactor later -func (a *fluenceWatcher) EnsureGroupDeployment(d *appsv1.Deployment) error { +func (a fluenceWatcher) EnsureGroupDeployment(d *appsv1.Deployment) error { // StatefulSet requires on top level explicitly if d.Labels == nil { diff --git a/src/build/scheduler/Dockerfile b/src/build/scheduler/Dockerfile index b39a141..fe04771 100644 --- a/src/build/scheduler/Dockerfile +++ b/src/build/scheduler/Dockerfile @@ -2,7 +2,7 @@ FROM fluxrm/flux-sched:jammy USER root ENV DEBIAN_FRONTEND=noninteractive -ENV GO_VERSION=1.21.9 +ENV GO_VERSION=1.22.0 RUN apt-get update && apt-get clean -y && apt -y autoremove diff --git a/src/fluence/go.mod b/src/fluence/go.mod index 31228d9..a8a54ed 100644 --- a/src/fluence/go.mod +++ b/src/fluence/go.mod @@ -1,6 +1,6 @@ module github.com/flux-framework/flux-k8s/flux-plugin/fluence -go 1.21 +go 1.22 require ( github.com/flux-framework/fluxion-go v0.32.1-0.20240420052153-909523c84ca2 From 5132ea8725a05463e2ff62caf2d1c3178225ea4a Mon Sep 17 00:00:00 2001 From: vsoch Date: Sun, 
1 Sep 2024 12:10:54 -0600 Subject: [PATCH 2/5] sidecar: boost system no longer provided or required Problem: boost_system is no longer required (or built into the base image) Solution: remove it. Signed-off-by: vsoch --- src/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Makefile b/src/Makefile index e31c8ec..8ab6b1f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -4,8 +4,7 @@ INSTALL_PREFIX ?= /usr LIB_PREFIX ?= /usr/lib LOCALBIN ?= $(shell pwd)/bin COMMONENVVAR=GOOS=$(shell uname -s | tr A-Z a-z) -#BUILDENVVAR=CGO_CFLAGS="-I${FLUX_SCHED_ROOT}/resource/reapi/bindings/c" CGO_LDFLAGS="-L${INSTALL_PREFIX}/lib -L${FLUX_SCHED_ROOT}/resource -lresource -L${FLUX_SCHED_ROOT}/resource/libjobspec -ljobspec_conv -L/${FLUX_SCHED_ROOT}/resource/reapi/bindings -lreapi_cli -lflux-idset -lstdc++ -ljansson -lhwloc -lboost_system -lflux-hostlist -lboost_graph -lyaml-cpp" -BUILDENVVAR=CGO_CFLAGS="-I${FLUX_SCHED_ROOT} -I${FLUX_SCHED_ROOT}/resource/reapi/bindings/c" CGO_LDFLAGS="-L${LIB_PREFIX} -L${LIB_PREFIX}/flux -L${FLUX_SCHED_ROOT}/resource/reapi/bindings -lreapi_cli -lflux-idset -lstdc++ -ljansson -lhwloc -lboost_system -lflux-hostlist -lboost_graph -lyaml-cpp" +BUILDENVVAR=CGO_CFLAGS="-I${FLUX_SCHED_ROOT} -I${FLUX_SCHED_ROOT}/resource/reapi/bindings/c" CGO_LDFLAGS="-L${LIB_PREFIX} -L${LIB_PREFIX}/flux -L${FLUX_SCHED_ROOT}/resource/reapi/bindings -lreapi_cli -lflux-idset -lstdc++ -ljansson -lhwloc -lflux-hostlist -lboost_graph -lyaml-cpp" LOCAL_REGISTRY=localhost:5000 From de4e706cc11c01f17d1d3fb96c1bfd51b4f4cf54 Mon Sep 17 00:00:00 2001 From: vsoch Date: Sun, 1 Sep 2024 12:46:09 -0600 Subject: [PATCH 3/5] ci: give more time for pods to ready Signed-off-by: vsoch --- .github/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/test.sh b/.github/test.sh index 2b8b1e6..e30cad5 100755 --- a/.github/test.sh +++ b/.github/test.sh @@ -55,7 +55,7 @@ default_job_pod=$(kubectl get pods --selector=job-name=default-job -o json | 
jq echo echo "Fluence job pod is ${fluence_job_pod}" echo "Default job pod is ${default_job_pod}" -sleep 10 +sleep 30 # Shared function to check output function check_output { From 0eda0d5c70b9b0bce4e49b3428fdd7f953e455cc Mon Sep 17 00:00:00 2001 From: vsoch Date: Sun, 1 Sep 2024 22:47:21 -0600 Subject: [PATCH 4/5] build: move build logic into Makefile The upstream hack/build-images.sh script is changing too often to be easy to maintain, and we anticipated this. Instead we can keep the build logic in our Makefile. Signed-off-by: vsoch --- Makefile | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index e09871c..ad9b370 100644 --- a/Makefile +++ b/Makefile @@ -54,12 +54,20 @@ prepare: clone cp sig-scheduler-plugins/apis/scheduling/v1alpha1/*.go $(CLONE_UPSTREAM)/apis/scheduling/v1alpha1/ cp sig-scheduler-plugins/cmd/controller/app/server.go $(CLONE_UPSTREAM)/cmd/controller/app/server.go +# This logic was moved from upstream/hack/build-images.sh - too much changing logic +# and became hard to maintain build: prepare echo ${GO_BASE_IMAGE} - BUILDER=${BUILDER} PLATFORMS=${PLATFORMS} REGISTRY=${REGISTRY} IMAGE=${SCHEDULER_IMAGE} \ - CONTROLLER_IMAGE=${CONTROLLER_IMAGE} RELEASE_VERSION=${RELEASE_VERSION} \ - GO_BASE_IMAGE=${GO_BASE_IMAGE} DISTROLESS_BASE_IMAGE=${DISTROLESS_BASE_IMAGE} \ - $(BASH) $(CLONE_UPSTREAM)/hack/build-images.sh + + docker build -f $(CLONE_UPSTREAM)/build/scheduler/Dockerfile --build-arg RELEASE_VERSION=${RELEASE_VERSION} \ + --build-arg GO_BASE_IMAGE=${GO_BASE_IMAGE} \ + --build-arg DISTROLESS_BASE_IMAGE=${DISTROLESS_BASE_IMAGE} \ + --build-arg CGO_ENABLED=0 -t ${REGISTRY}/${SCHEDULER_IMAGE} $(CLONE_UPSTREAM) + + docker build -f $(CLONE_UPSTREAM)/build/controller/Dockerfile --build-arg RELEASE_VERSION=${RELEASE_VERSION} \ + --build-arg GO_BASE_IMAGE=${GO_BASE_IMAGE} \ + --build-arg DISTROLESS_BASE_IMAGE=${DISTROLESS_BASE_IMAGE} \ + --build-arg CGO_ENABLED=0 -t ${REGISTRY}/${CONTROLLER_IMAGE} 
$(CLONE_UPSTREAM) push-sidecar: $(DOCKER) push $(REGISTRY)/$(SIDECAR_IMAGE):$(TAG) --all-tags From 8ca051fecd26a7feac2ae343348c05a5eb2b5457 Mon Sep 17 00:00:00 2001 From: vsoch Date: Sun, 1 Sep 2024 23:04:59 -0600 Subject: [PATCH 5/5] bug: manifest path for scheduler deployment There was a change upstream that switched the kube-scheduler back to being in bin (in the Dockerfile) but the corresponding manifest was not updated. Update the deployment command to /bin/kube-scheduler to match. Note: this patch also changes the wait in .github/test.sh from 30 back to 10 seconds, which reverts PATCH 3/5; confirm that this is intended. Signed-off-by: vsoch --- .github/test.sh | 2 +- .../charts/as-a-second-scheduler/templates/deployment.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/test.sh b/.github/test.sh index e30cad5..2b8b1e6 100755 --- a/.github/test.sh +++ b/.github/test.sh @@ -55,7 +55,7 @@ default_job_pod=$(kubectl get pods --selector=job-name=default-job -o json | jq echo echo "Fluence job pod is ${fluence_job_pod}" echo "Default job pod is ${default_job_pod}" -sleep 30 +sleep 10 # Shared function to check output function check_output { diff --git a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml index 7880c17..289a0e5 100644 --- a/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml +++ b/sig-scheduler-plugins/manifests/install/charts/as-a-second-scheduler/templates/deployment.yaml @@ -67,7 +67,7 @@ spec: - containerPort: {{ .Values.scheduler.port }} hostPort: {{ .Values.scheduler.port }}{{ end }} - command: - - /kube-scheduler + - /bin/kube-scheduler - --config=/etc/kubernetes/scheduler-config.yaml - -v={{ .Values.scheduler.loggingLevel }} image: {{ .Values.scheduler.image }}