From 7844680f3b60d4c4c6c8133b4a32db2dce28849c Mon Sep 17 00:00:00 2001
From: Shereen Haj
Date: Mon, 12 Aug 2024 17:30:48 +0300
Subject: [PATCH] sched: controller: set scheduler priority

So far the scheduler priority is set to the default, which is 0. This
is risky, especially when preemption of pods is needed to fit more
important pods.

The NUMA resources scheduler (NRS) is important enough to deserve the
most critical priority class, system-node-critical, which is the same
priority used by the kube-scheduler. We need this priority set always,
regardless of how many replicas are set for the scheduler, and
especially if we look to optimize the HA of the scheduler.

We choose system-node-critical over system-cluster-critical because we
don't want to allow the secondary scheduler (SS) to be preempted by
higher-priority pods. If it were set to system-cluster-critical and an
event triggered pod eviction in order to schedule system-node-critical
workloads, the SS would be at risk of being evicted. Although this
would be very rare and the evicted pod would be rescheduled, there is
no convincing reason not to make it node-critical.

Addresses https://github.com/openshift-kni/numaresources-operator/issues/974

Signed-off-by: Shereen Haj
---
 controllers/numaresourcesscheduler_controller.go      |  4 ++++
 .../numaresourcesscheduler_controller_test.go         | 16 ++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/controllers/numaresourcesscheduler_controller.go b/controllers/numaresourcesscheduler_controller.go
index 8f28675c9..cbcace1a1 100644
--- a/controllers/numaresourcesscheduler_controller.go
+++ b/controllers/numaresourcesscheduler_controller.go
@@ -56,6 +56,7 @@ import (
 
 const (
 	leaderElectionResourceName = "numa-scheduler-leader"
+	schedulerPriorityClassName = "system-node-critical"
 )
 
 const (
@@ -217,6 +218,9 @@ func (r *NUMAResourcesSchedulerReconciler) syncNUMASchedulerResources(ctx contex
 	// TODO: if replicas doesn't make sense (autodetect disabled and user set impossible value) then we
 	// should set a degraded state
 
+	// node-critical so the pod won't be preempted by pods having the most critical priority class
+	r.SchedulerManifests.Deployment.Spec.Template.Spec.PriorityClassName = schedulerPriorityClassName
+
 	schedupdate.DeploymentImageSettings(r.SchedulerManifests.Deployment, schedSpec.SchedulerImage)
 	cmHash := hash.ConfigMapData(r.SchedulerManifests.ConfigMap)
 	schedupdate.DeploymentConfigMapSettings(r.SchedulerManifests.Deployment, r.SchedulerManifests.ConfigMap.Name, cmHash)
diff --git a/controllers/numaresourcesscheduler_controller_test.go b/controllers/numaresourcesscheduler_controller_test.go
index 326495a60..b8c700fc1 100644
--- a/controllers/numaresourcesscheduler_controller_test.go
+++ b/controllers/numaresourcesscheduler_controller_test.go
@@ -208,6 +208,22 @@ var _ = ginkgo.Describe("Test NUMAResourcesScheduler Reconcile", func() {
 		gomega.Expect(nrs.Status.CacheResyncPeriod.Seconds()).To(gomega.Equal(resyncPeriod.Seconds()))
 	})
 
+	ginkgo.It("should have the correct priority class", func() {
+		key := client.ObjectKeyFromObject(nrs)
+		_, err := reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: key})
+		gomega.Expect(err).ToNot(gomega.HaveOccurred())
+
+		key = client.ObjectKey{
+			Name:      "secondary-scheduler",
+			Namespace: testNamespace,
+		}
+
+		dp := &appsv1.Deployment{}
+		gomega.Expect(reconciler.Client.Get(context.TODO(), key, dp)).ToNot(gomega.HaveOccurred())
+
+		gomega.Expect(dp.Spec.Template.Spec.PriorityClassName).To(gomega.BeEquivalentTo(schedulerPriorityClassName))
+	})
+
 	ginkgo.It("should have a config hash annotation under deployment", func() {
 		key := client.ObjectKeyFromObject(nrs)
 		_, err := reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: key})
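
For reviewers, a standalone Go sketch (illustrative only, not part of the patch) of the preemption ordering that motivates the class chosen above; the numeric values are the defaults Kubernetes assigns to the two built-in priority classes:

package main

import (
	"fmt"

	schedulingv1 "k8s.io/api/scheduling/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// The two built-in priority classes, with the values Kubernetes installs by default.
	clusterCritical := schedulingv1.PriorityClass{
		ObjectMeta: metav1.ObjectMeta{Name: "system-cluster-critical"},
		Value:      2000000000,
	}
	nodeCritical := schedulingv1.PriorityClass{
		ObjectMeta: metav1.ObjectMeta{Name: "system-node-critical"},
		Value:      2000001000,
	}

	fmt.Printf("%s=%d, %s=%d\n",
		clusterCritical.Name, clusterCritical.Value,
		nodeCritical.Name, nodeCritical.Value)

	// Preemption only evicts pods with strictly lower priority, so a scheduler pod
	// running as system-node-critical can never be evicted to make room for a
	// system-cluster-critical workload.
	fmt.Println("node-critical preemptible by cluster-critical:",
		clusterCritical.Value > nodeCritical.Value) // prints: false
}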