From 28be721a9d3e211dda609bd1db6b5ad64792257a Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Tue, 8 Oct 2024 19:44:06 +0200 Subject: [PATCH 01/10] ctrl: nrop: updateStatusConditionsIfNeeded in the happy path Inline status update in the happy path, if the reconciliation loop completed all the expected steps. This is a intermediate step towards the final cleanup, and should cause no changes in behavior. Signed-off-by: Francesco Romani --- .../numaresourcesoperator_controller.go | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/controllers/numaresourcesoperator_controller.go b/controllers/numaresourcesoperator_controller.go index 061e8ec34..fd78449df 100644 --- a/controllers/numaresourcesoperator_controller.go +++ b/controllers/numaresourcesoperator_controller.go @@ -164,18 +164,27 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr } result, condition, err := r.reconcileResource(ctx, instance, trees) - if condition != "" { - if condition == status.ConditionAvailable && multiMCPsErr != nil { - _, _ = r.updateStatus(ctx, instance, status.ConditionDegraded, validation.NodeGroupsError, multiMCPsErr) - } else { - _, _ = r.updateStatus(ctx, instance, condition, reasonFromError(err), err) - } + + _ = updateStatusConditionsIfNeeded(instance, condition, reasonFromError(err), messageFromError(err)) + + if condition == status.ConditionAvailable && multiMCPsErr != nil { + _, _ = r.updateStatus(ctx, instance, status.ConditionDegraded, validation.NodeGroupsError, multiMCPsErr) } + + updErr := r.Client.Status().Update(ctx, instance) + if updErr != nil { + klog.InfoS("Failed to update numaresourcesoperator status", "error", updErr) + return ctrl.Result{}, fmt.Errorf("could not update status for object %s: %w", client.ObjectKeyFromObject(instance), updErr) + } + return result, err } // updateStatusConditionsIfNeeded returns true if conditions were updated. 
func updateStatusConditionsIfNeeded(instance *nropv1.NUMAResourcesOperator, condition string, reason string, message string) bool { + if condition == "" { + return false + } klog.InfoS("updateStatus", "condition", condition, "reason", reason, "message", message) conditions, ok := status.UpdateConditions(instance.Status.Conditions, condition, reason, message) if ok { From 5d0a626074a1074a14df07bfe3a550d1dd01c88d Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Tue, 8 Oct 2024 20:13:30 +0200 Subject: [PATCH 02/10] ctrl: nrop: updateStatus -> degradeStatus Signed-off-by: Francesco Romani --- controllers/numaresourcesoperator_controller.go | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/controllers/numaresourcesoperator_controller.go b/controllers/numaresourcesoperator_controller.go index fd78449df..8c2deaccd 100644 --- a/controllers/numaresourcesoperator_controller.go +++ b/controllers/numaresourcesoperator_controller.go @@ -140,22 +140,22 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr if req.Name != objectnames.DefaultNUMAResourcesOperatorCrName { err := fmt.Errorf("incorrect NUMAResourcesOperator resource name: %s", instance.Name) - return r.updateStatus(ctx, instance, status.ConditionDegraded, status.ConditionTypeIncorrectNUMAResourcesOperatorResourceName, err) + return r.degradeStatus(ctx, instance, status.ConditionTypeIncorrectNUMAResourcesOperatorResourceName, err) } if err := validation.NodeGroups(instance.Spec.NodeGroups); err != nil { - return r.updateStatus(ctx, instance, status.ConditionDegraded, validation.NodeGroupsError, err) + return r.degradeStatus(ctx, instance, validation.NodeGroupsError, err) } trees, err := getTreesByNodeGroup(ctx, r.Client, instance.Spec.NodeGroups) if err != nil { - return r.updateStatus(ctx, instance, status.ConditionDegraded, validation.NodeGroupsError, err) + return r.degradeStatus(ctx, instance, validation.NodeGroupsError, err) } multiMCPsErr := 
validation.MultipleMCPsPerTree(instance.Annotations, trees) if err := validation.MachineConfigPoolDuplicates(trees); err != nil { - return r.updateStatus(ctx, instance, status.ConditionDegraded, validation.NodeGroupsError, err) + return r.degradeStatus(ctx, instance, validation.NodeGroupsError, err) } for idx := range trees { @@ -168,7 +168,7 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr _ = updateStatusConditionsIfNeeded(instance, condition, reasonFromError(err), messageFromError(err)) if condition == status.ConditionAvailable && multiMCPsErr != nil { - _, _ = r.updateStatus(ctx, instance, status.ConditionDegraded, validation.NodeGroupsError, multiMCPsErr) + _, _ = r.degradeStatus(ctx, instance, validation.NodeGroupsError, multiMCPsErr) } updErr := r.Client.Status().Update(ctx, instance) @@ -193,10 +193,11 @@ func updateStatusConditionsIfNeeded(instance *nropv1.NUMAResourcesOperator, cond return ok } -func (r *NUMAResourcesOperatorReconciler) updateStatus(ctx context.Context, instance *nropv1.NUMAResourcesOperator, condition string, reason string, stErr error) (ctrl.Result, error) { +func (r *NUMAResourcesOperatorReconciler) degradeStatus(ctx context.Context, instance *nropv1.NUMAResourcesOperator, reason string, stErr error) (ctrl.Result, error) { message := messageFromError(stErr) - _ = updateStatusConditionsIfNeeded(instance, condition, reason, message) + _ = updateStatusConditionsIfNeeded(instance, status.ConditionDegraded, reason, message) + // TODO: if we keep being degraded, we likely (= if we don't, it's too implicit) keep sending possibly redundant updates to the apiserver err := r.Client.Status().Update(ctx, instance) if err != nil { From 48b858c1fcc37552d0e1b63e36f940208b501c09 Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Wed, 9 Oct 2024 08:52:10 +0200 Subject: [PATCH 03/10] ctrl: status: nrop: explicit diff check Instead of relying in the chain of helpers to report if something changed in status, thus 
warrants a update, run a full diff of our status explictely. If something worthy (whose definition depends on the helper implemented in `pkg/status`) changed, then we will push a status update. This is probably slower than relying on the reports from subfunctions, but simplifies and streamlines the code significantly. Signed-off-by: Francesco Romani --- .../numaresourcesoperator_controller.go | 6 ++ pkg/status/status.go | 11 +++ pkg/status/status_test.go | 81 +++++++++++++++++++ 3 files changed, 98 insertions(+) diff --git a/controllers/numaresourcesoperator_controller.go b/controllers/numaresourcesoperator_controller.go index 8c2deaccd..23fd93b7a 100644 --- a/controllers/numaresourcesoperator_controller.go +++ b/controllers/numaresourcesoperator_controller.go @@ -163,6 +163,8 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr trees[idx].NodeGroup.Config = &conf } + curStatus := instance.Status.DeepCopy() + result, condition, err := r.reconcileResource(ctx, instance, trees) _ = updateStatusConditionsIfNeeded(instance, condition, reasonFromError(err), messageFromError(err)) @@ -171,6 +173,10 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr _, _ = r.degradeStatus(ctx, instance, validation.NodeGroupsError, multiMCPsErr) } + if !status.IsUpdatedNUMAResourcesOperator(curStatus, &instance.Status) { + return result, err + } + updErr := r.Client.Status().Update(ctx, instance) if updErr != nil { klog.InfoS("Failed to update numaresourcesoperator status", "error", updErr) diff --git a/pkg/status/status.go b/pkg/status/status.go index 558fea259..3ac16f359 100644 --- a/pkg/status/status.go +++ b/pkg/status/status.go @@ -23,6 +23,8 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + nropv1 "github.com/openshift-kni/numaresources-operator/api/numaresourcesoperator/v1" ) // TODO: are we duping these? 
@@ -37,6 +39,15 @@ const ( ConditionTypeIncorrectNUMAResourcesOperatorResourceName = "IncorrectNUMAResourcesOperatorResourceName" ) +func IsUpdatedNUMAResourcesOperator(oldStatus, newStatus *nropv1.NUMAResourcesOperatorStatus) bool { + options := []cmp.Option{ + cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime"), + cmpopts.IgnoreFields(metav1.Condition{}, "ObservedGeneration"), + } + + return !cmp.Equal(newStatus, oldStatus, options...) +} + // UpdateConditions compute new conditions based on arguments, and then compare with given current conditions. // Returns the conditions to use, either current or newly computed, and a boolean flag which is `true` if conditions need // update - so if they are updated since the current conditions. diff --git a/pkg/status/status_test.go b/pkg/status/status_test.go index c09e1d91d..2da592168 100644 --- a/pkg/status/status_test.go +++ b/pkg/status/status_test.go @@ -19,6 +19,7 @@ package status import ( "context" "testing" + "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes/scheme" @@ -78,3 +79,83 @@ func TestUpdateIfNeeded(t *testing.T) { t.Errorf("Update did change status, but it should not") } } + +func TestIsUpdatedNUMAResourcesOperator(t *testing.T) { + type testCase struct { + name string + oldStatus *nropv1.NUMAResourcesOperatorStatus + updaterFunc func(*nropv1.NUMAResourcesOperatorStatus) + expectedUpdated bool + } + testCases := []testCase{ + { + name: "zero status, no change", + oldStatus: &nropv1.NUMAResourcesOperatorStatus{}, + updaterFunc: func(st *nropv1.NUMAResourcesOperatorStatus) {}, + expectedUpdated: false, + }, + { + name: "status, conditions, updated only time", + oldStatus: &nropv1.NUMAResourcesOperatorStatus{ + Conditions: NewConditions(ConditionAvailable, "test all good", "testing info"), + }, + updaterFunc: func(st *nropv1.NUMAResourcesOperatorStatus) { + time.Sleep(42 * time.Millisecond) // make sure the timestamp changed + st.Conditions = 
NewConditions(ConditionAvailable, "test all good", "testing info") + }, + expectedUpdated: false, + }, + { + name: "status, conditions, updated only time, other fields changed", + oldStatus: &nropv1.NUMAResourcesOperatorStatus{ + Conditions: NewConditions(ConditionAvailable, "test all good", "testing info"), + }, + updaterFunc: func(st *nropv1.NUMAResourcesOperatorStatus) { + time.Sleep(42 * time.Millisecond) // make sure the timestamp changed + st.Conditions = NewConditions(ConditionAvailable, "test all good", "testing info") + st.DaemonSets = []nropv1.NamespacedName{ + { + Namespace: "foo", + Name: "bar", + }, + } + }, + expectedUpdated: true, + }, + { + name: "status, conditions, updated only time, other fields mutated", + oldStatus: &nropv1.NUMAResourcesOperatorStatus{ + Conditions: NewConditions(ConditionAvailable, "test all good", "testing info"), + DaemonSets: []nropv1.NamespacedName{ + { + Namespace: "foo", + Name: "bar", + }, + }, + }, + updaterFunc: func(st *nropv1.NUMAResourcesOperatorStatus) { + time.Sleep(42 * time.Millisecond) // make sure the timestamp changed + st.Conditions = NewConditions(ConditionAvailable, "test all good", "testing info") + st.DaemonSets = []nropv1.NamespacedName{ + { + Namespace: "foo", + Name: "zap", + }, + } + }, + expectedUpdated: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + oldStatus := tc.oldStatus.DeepCopy() + newStatus := tc.oldStatus.DeepCopy() + tc.updaterFunc(newStatus) + got := IsUpdatedNUMAResourcesOperator(oldStatus, newStatus) + if got != tc.expectedUpdated { + t.Errorf("isUpdated %v expected %v", got, tc.expectedUpdated) + } + }) + } +} From e0a7fa2a1f1e9f97c0be74c9899aaf34fdb00b5c Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Wed, 9 Oct 2024 08:55:17 +0200 Subject: [PATCH 04/10] ctrl: nrop: push condition update in reconcileResources The only reason why we update the status conditions outside reconcileResources, while we update everything else related to 
status inside, is historical. We are now enabled to close this gap and streamline the code further. Signed-off-by: Francesco Romani --- .../numaresourcesoperator_controller.go | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/controllers/numaresourcesoperator_controller.go b/controllers/numaresourcesoperator_controller.go index 23fd93b7a..3ccabda72 100644 --- a/controllers/numaresourcesoperator_controller.go +++ b/controllers/numaresourcesoperator_controller.go @@ -165,9 +165,7 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr curStatus := instance.Status.DeepCopy() - result, condition, err := r.reconcileResource(ctx, instance, trees) - - _ = updateStatusConditionsIfNeeded(instance, condition, reasonFromError(err), messageFromError(err)) + result, err := r.reconcileResource(ctx, instance, trees) if condition == status.ConditionAvailable && multiMCPsErr != nil { _, _ = r.degradeStatus(ctx, instance, validation.NodeGroupsError, multiMCPsErr) @@ -277,27 +275,31 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context return daemonSetsInfoPerMCP, false, ctrl.Result{}, "", nil } -func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) (ctrl.Result, string, error) { +func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) (ctrl.Result, error) { if done, res, cond, err := r.reconcileResourceAPI(ctx, instance, trees); done { - return res, cond, err + _ = updateStatusConditionsIfNeeded(instance, cond, reasonFromError(err), messageFromError(err)) + return res, err } if r.Platform == platform.OpenShift { if done, res, cond, err := r.reconcileResourceMachineConfig(ctx, instance, trees); done { - return res, cond, err + _ = updateStatusConditionsIfNeeded(instance, cond, reasonFromError(err), 
messageFromError(err)) + return res, err } } dsPerMCP, done, res, cond, err := r.reconcileResourceDaemonSet(ctx, instance, trees) if done { - return res, cond, err + _ = updateStatusConditionsIfNeeded(instance, cond, reasonFromError(err), messageFromError(err)) + return res, err } // all fields of NodeGroupStatus are required so publish the status only when all daemonset and MCPs are updated which // is a certain thing if we got to this point otherwise the function would have returned already instance.Status.NodeGroups = syncNodeGroupsStatus(instance, dsPerMCP) - return ctrl.Result{}, status.ConditionAvailable, nil + _ = updateStatusConditionsIfNeeded(instance, status.ConditionAvailable, reasonFromError(nil), messageFromError(nil)) + return ctrl.Result{}, nil } func (r *NUMAResourcesOperatorReconciler) syncDaemonSetsStatuses(ctx context.Context, rd client.Reader, daemonSetsInfo []poolDaemonSet) ([]nropv1.NamespacedName, bool, error) { From e8c7be9297be75af1ee14f60403b2c65a29df6ce Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Mon, 28 Oct 2024 14:05:50 +0100 Subject: [PATCH 05/10] controller: nrop: cleanup: fix updateStatusIfNeeded we never use the return value, good riddance. Signed-off-by: Francesco Romani --- controllers/numaresourcesoperator_controller.go | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/controllers/numaresourcesoperator_controller.go b/controllers/numaresourcesoperator_controller.go index 3ccabda72..b367d5a61 100644 --- a/controllers/numaresourcesoperator_controller.go +++ b/controllers/numaresourcesoperator_controller.go @@ -185,22 +185,21 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr } // updateStatusConditionsIfNeeded returns true if conditions were updated. 
-func updateStatusConditionsIfNeeded(instance *nropv1.NUMAResourcesOperator, condition string, reason string, message string) bool { - if condition == "" { - return false +func updateStatusConditionsIfNeeded(instance *nropv1.NUMAResourcesOperator, condition string, reason string, message string) { + if condition == "" { // backward (=legacy) compatibility + return } klog.InfoS("updateStatus", "condition", condition, "reason", reason, "message", message) conditions, ok := status.UpdateConditions(instance.Status.Conditions, condition, reason, message) if ok { instance.Status.Conditions = conditions } - return ok } func (r *NUMAResourcesOperatorReconciler) degradeStatus(ctx context.Context, instance *nropv1.NUMAResourcesOperator, reason string, stErr error) (ctrl.Result, error) { message := messageFromError(stErr) - _ = updateStatusConditionsIfNeeded(instance, status.ConditionDegraded, reason, message) + updateStatusConditionsIfNeeded(instance, status.ConditionDegraded, reason, message) // TODO: if we keep being degraded, we likely (= if we don't, it's too implicit) keep sending possibly redundant updates to the apiserver err := r.Client.Status().Update(ctx, instance) @@ -277,20 +276,20 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) (ctrl.Result, error) { if done, res, cond, err := r.reconcileResourceAPI(ctx, instance, trees); done { - _ = updateStatusConditionsIfNeeded(instance, cond, reasonFromError(err), messageFromError(err)) + updateStatusConditionsIfNeeded(instance, cond, reasonFromError(err), messageFromError(err)) return res, err } if r.Platform == platform.OpenShift { if done, res, cond, err := r.reconcileResourceMachineConfig(ctx, instance, trees); done { - _ = updateStatusConditionsIfNeeded(instance, cond, reasonFromError(err), messageFromError(err)) + 
updateStatusConditionsIfNeeded(instance, cond, reasonFromError(err), messageFromError(err)) return res, err } } dsPerMCP, done, res, cond, err := r.reconcileResourceDaemonSet(ctx, instance, trees) if done { - _ = updateStatusConditionsIfNeeded(instance, cond, reasonFromError(err), messageFromError(err)) + updateStatusConditionsIfNeeded(instance, cond, reasonFromError(err), messageFromError(err)) return res, err } @@ -298,7 +297,7 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, // is a certain thing if we got to this point otherwise the function would have returned already instance.Status.NodeGroups = syncNodeGroupsStatus(instance, dsPerMCP) - _ = updateStatusConditionsIfNeeded(instance, status.ConditionAvailable, reasonFromError(nil), messageFromError(nil)) + updateStatusConditionsIfNeeded(instance, status.ConditionAvailable, reasonFromError(nil), messageFromError(nil)) return ctrl.Result{}, nil } From 188c08994f9ed4ab88f4d4fedf5c77f03b24f61a Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Tue, 29 Oct 2024 11:21:18 +0100 Subject: [PATCH 06/10] ctrl: nrop: factor condition info data in a struct instead of passing condition types, possibly message, maybe error, then derive the full condition data in many place, factor all the data in a condition info struct, to be used as basis for creating the real metav1.Condition. This clean up things and unlocks further cleanups. 
Signed-off-by: Francesco Romani --- controllers/controllers.go | 59 ++++++++++++++++++- .../numaresourcesoperator_controller.go | 52 ++++++++-------- 2 files changed, 87 insertions(+), 24 deletions(-) diff --git a/controllers/controllers.go b/controllers/controllers.go index 4f327790c..172c49c12 100644 --- a/controllers/controllers.go +++ b/controllers/controllers.go @@ -16,7 +16,64 @@ package controllers -import "errors" +import ( + "errors" + + "github.com/openshift-kni/numaresources-operator/pkg/status" +) + +type conditionInfo struct { + Type string // like metav1.Condition.Type + Reason string + Message string +} + +func (ci conditionInfo) WithReason(reason string) conditionInfo { + ret := ci + if ret.Reason == "" { + ret.Reason = reason + } + return ret +} + +func (ci conditionInfo) WithMessage(message string) conditionInfo { + ret := ci + if ret.Message == "" { + ret.Message = message + } + return ret +} + +func availableConditionInfo() conditionInfo { + return conditionInfo{ + Type: status.ConditionAvailable, + Reason: "AsExpected", + } +} + +func progressingConditionInfo() conditionInfo { + return conditionInfo{ + Type: status.ConditionProgressing, + } +} + +func degradedConditionInfoFromError(err error) conditionInfo { + return conditionInfo{ + Type: status.ConditionDegraded, + Message: messageFromError(err), + Reason: reasonFromError(err), + } +} + +func fillConditionInfoFromError(cond conditionInfo, err error) conditionInfo { + if cond.Message == "" { + cond.Message = messageFromError(err) + } + if cond.Reason == "" { + cond.Reason = reasonFromError(err) + } + return cond +} func reasonFromError(err error) string { if err == nil { diff --git a/controllers/numaresourcesoperator_controller.go b/controllers/numaresourcesoperator_controller.go index b367d5a61..cad5332af 100644 --- a/controllers/numaresourcesoperator_controller.go +++ b/controllers/numaresourcesoperator_controller.go @@ -185,21 +185,24 @@ func (r *NUMAResourcesOperatorReconciler) 
Reconcile(ctx context.Context, req ctr } // updateStatusConditionsIfNeeded returns true if conditions were updated. -func updateStatusConditionsIfNeeded(instance *nropv1.NUMAResourcesOperator, condition string, reason string, message string) { - if condition == "" { // backward (=legacy) compatibility +func updateStatusConditionsIfNeeded(instance *nropv1.NUMAResourcesOperator, cond conditionInfo) { + if cond.Type == "" { // backward (=legacy) compatibility return } - klog.InfoS("updateStatus", "condition", condition, "reason", reason, "message", message) - conditions, ok := status.UpdateConditions(instance.Status.Conditions, condition, reason, message) + klog.InfoS("updateStatus", "condition", cond.Type, "reason", cond.Reason, "message", cond.Message) + conditions, ok := status.UpdateConditions(instance.Status.Conditions, cond.Type, cond.Reason, cond.Message) if ok { instance.Status.Conditions = conditions } } func (r *NUMAResourcesOperatorReconciler) degradeStatus(ctx context.Context, instance *nropv1.NUMAResourcesOperator, reason string, stErr error) (ctrl.Result, error) { - message := messageFromError(stErr) + info := degradedConditionInfoFromError(stErr) + if reason != "" { // intentionally overwrite + info.Reason = reason + } - updateStatusConditionsIfNeeded(instance, status.ConditionDegraded, reason, message) + updateStatusConditionsIfNeeded(instance, info) // TODO: if we keep being degraded, we likely (= if we don't, it's too implicit) keep sending possibly redundant updates to the apiserver err := r.Client.Status().Update(ctx, instance) @@ -213,25 +216,27 @@ func (r *NUMAResourcesOperatorReconciler) degradeStatus(ctx context.Context, ins return ctrl.Result{}, nil } -func (r *NUMAResourcesOperatorReconciler) reconcileResourceAPI(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) (bool, ctrl.Result, string, error) { +func (r *NUMAResourcesOperatorReconciler) reconcileResourceAPI(ctx context.Context, instance 
*nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) (bool, ctrl.Result, conditionInfo, error) { applied, err := r.syncNodeResourceTopologyAPI(ctx) if err != nil { r.Recorder.Eventf(instance, corev1.EventTypeWarning, "FailedCRDInstall", "Failed to install Node Resource Topology CRD: %v", err) - return true, ctrl.Result{}, status.ConditionDegraded, fmt.Errorf("FailedAPISync: %w", err) + err = fmt.Errorf("FailedAPISync: %w", err) + return true, ctrl.Result{}, degradedConditionInfoFromError(err), err } if applied { r.Recorder.Eventf(instance, corev1.EventTypeNormal, "SuccessfulCRDInstall", "Node Resource Topology CRD installed") } - return false, ctrl.Result{}, "", nil + return false, ctrl.Result{}, availableConditionInfo(), nil } -func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) (bool, ctrl.Result, string, error) { +func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) (bool, ctrl.Result, conditionInfo, error) { // we need to sync machine configs first and wait for the MachineConfigPool updates // before checking additional components for updates mcpUpdatedFunc, err := r.syncMachineConfigs(ctx, instance, trees) if err != nil { r.Recorder.Eventf(instance, corev1.EventTypeWarning, "FailedMCSync", "Failed to set up machine configuration for worker nodes: %v", err) - return true, ctrl.Result{}, status.ConditionDegraded, fmt.Errorf("failed to sync machine configs: %w", err) + err = fmt.Errorf("failed to sync machine configs: %w", err) + return true, ctrl.Result{}, degradedConditionInfoFromError(err), err } r.Recorder.Eventf(instance, corev1.EventTypeNormal, "SuccessfulMCSync", "Enabled machine configuration for worker nodes") @@ -242,21 +247,22 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx con if !allMCPsUpdated { // the 
Machine Config Pool still did not apply the machine config, wait for one minute - return true, ctrl.Result{RequeueAfter: numaResourcesRetryPeriod}, status.ConditionProgressing, nil + return true, ctrl.Result{RequeueAfter: numaResourcesRetryPeriod}, progressingConditionInfo(), nil } instance.Status.MachineConfigPools = syncMachineConfigPoolNodeGroupConfigStatuses(instance.Status.MachineConfigPools, trees) - return false, ctrl.Result{}, "", nil + return false, ctrl.Result{}, availableConditionInfo(), nil } -func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) ([]poolDaemonSet, bool, ctrl.Result, string, error) { +func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) ([]poolDaemonSet, bool, ctrl.Result, conditionInfo, error) { daemonSetsInfoPerMCP, err := r.syncNUMAResourcesOperatorResources(ctx, instance, trees) if err != nil { r.Recorder.Eventf(instance, corev1.EventTypeWarning, "FailedRTECreate", "Failed to create Resource-Topology-Exporter DaemonSets: %v", err) - return nil, true, ctrl.Result{}, status.ConditionDegraded, fmt.Errorf("FailedRTESync: %w", err) + err = fmt.Errorf("FailedRTESync: %w", err) + return nil, true, ctrl.Result{}, degradedConditionInfoFromError(err), err } if len(daemonSetsInfoPerMCP) == 0 { - return nil, false, ctrl.Result{}, "", nil + return nil, false, ctrl.Result{}, availableConditionInfo(), nil } r.Recorder.Eventf(instance, corev1.EventTypeNormal, "SuccessfulRTECreate", "Created Resource-Topology-Exporter DaemonSets") @@ -265,31 +271,31 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context instance.Status.DaemonSets = dssWithReadyStatus instance.Status.RelatedObjects = relatedobjects.ResourceTopologyExporter(r.Namespace, dssWithReadyStatus) if err != nil { - return nil, true, ctrl.Result{}, 
status.ConditionDegraded, err + return nil, true, ctrl.Result{}, degradedConditionInfoFromError(err), err } if !allDSsUpdated { - return nil, true, ctrl.Result{RequeueAfter: 5 * time.Second}, status.ConditionProgressing, nil + return nil, true, ctrl.Result{RequeueAfter: 5 * time.Second}, progressingConditionInfo(), nil } - return daemonSetsInfoPerMCP, false, ctrl.Result{}, "", nil + return daemonSetsInfoPerMCP, false, ctrl.Result{}, availableConditionInfo(), nil } func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) (ctrl.Result, error) { if done, res, cond, err := r.reconcileResourceAPI(ctx, instance, trees); done { - updateStatusConditionsIfNeeded(instance, cond, reasonFromError(err), messageFromError(err)) + updateStatusConditionsIfNeeded(instance, fillConditionInfoFromError(cond, err)) return res, err } if r.Platform == platform.OpenShift { if done, res, cond, err := r.reconcileResourceMachineConfig(ctx, instance, trees); done { - updateStatusConditionsIfNeeded(instance, cond, reasonFromError(err), messageFromError(err)) + updateStatusConditionsIfNeeded(instance, fillConditionInfoFromError(cond, err)) return res, err } } dsPerMCP, done, res, cond, err := r.reconcileResourceDaemonSet(ctx, instance, trees) if done { - updateStatusConditionsIfNeeded(instance, cond, reasonFromError(err), messageFromError(err)) + updateStatusConditionsIfNeeded(instance, fillConditionInfoFromError(cond, err)) return res, err } @@ -297,7 +303,7 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, // is a certain thing if we got to this point otherwise the function would have returned already instance.Status.NodeGroups = syncNodeGroupsStatus(instance, dsPerMCP) - updateStatusConditionsIfNeeded(instance, status.ConditionAvailable, reasonFromError(nil), messageFromError(nil)) + updateStatusConditionsIfNeeded(instance, availableConditionInfo()) return ctrl.Result{}, 
nil } From 1e2a67739782dffd5538c729dc98ad35bb69feff Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Tue, 19 Nov 2024 09:41:26 +0100 Subject: [PATCH 07/10] ctrl: nrop: pack the common return args in a struct the reconciliation steps are returning a common (and growing) set of values, let's pack them in a struct, since we always want to return the same tuple anyway for consistency. Signed-off-by: Francesco Romani --- controllers/controllers.go | 72 ++++++++++++++++--- .../numaresourcesoperator_controller.go | 63 ++++++++-------- .../numaresourcesoperator_controller_test.go | 3 +- 3 files changed, 97 insertions(+), 41 deletions(-) diff --git a/controllers/controllers.go b/controllers/controllers.go index 172c49c12..4c649ddd8 100644 --- a/controllers/controllers.go +++ b/controllers/controllers.go @@ -18,16 +18,70 @@ package controllers import ( "errors" + "time" + + ctrl "sigs.k8s.io/controller-runtime" "github.com/openshift-kni/numaresources-operator/pkg/status" ) +type reconcileStep struct { + Result ctrl.Result + ConditionInfo conditionInfo + Error error +} + +// Done returns true if a reconciliation step is fully completed, +// meaning all the substeps have been performed successfully, false otherwise +func (rs reconcileStep) Done() bool { + return rs.ConditionInfo.Type == status.ConditionAvailable +} + +// EarlyStop returns true if we should shortcut after a reconcile substep, +// false otherwise +func (rs reconcileStep) EarlyStop() bool { + return rs.ConditionInfo.Type != status.ConditionAvailable +} + +// reconcileStepSuccess returns a reconcileStep which tells the caller +// the reconciliation attempt was completed successfully +func reconcileStepSuccess() reconcileStep { + return reconcileStep{ + Result: ctrl.Result{}, + ConditionInfo: availableConditionInfo(), + } +} + +// reconcileStepOngoing returns a reconcileStep which tells the caller +// the reconciliation attempt needs to wait for external condition (e.g. 
+// MachineConfig updates and to retry to reconcile after the `after` value. +func reconcileStepOngoing(after time.Duration) reconcileStep { + return reconcileStep{ + Result: ctrl.Result{RequeueAfter: after}, + ConditionInfo: progressingConditionInfo(), + } +} + +// reconcileStepFailed returns a reconcileStep which tells the caller +// the reconciliation attempt failed with the given error `err`, and +// that the NUMAResourcesOperator condition should be Degraded. +func reconcileStepFailed(err error) reconcileStep { + return reconcileStep{ + Result: ctrl.Result{}, + ConditionInfo: degradedConditionInfoFromError(err), + Error: err, + } +} + +// conditionInfo holds all the data needed to build a Condition type conditionInfo struct { Type string // like metav1.Condition.Type Reason string Message string } +// WithReason override the conditionInfo reason with the given value, +// and returns a new updated conditionInfo func (ci conditionInfo) WithReason(reason string) conditionInfo { ret := ci if ret.Reason == "" { @@ -36,6 +90,8 @@ func (ci conditionInfo) WithReason(reason string) conditionInfo { return ret } +// WithMessage override the conditionInfo message with the given value, +// and returns a new updated conditionInfo func (ci conditionInfo) WithMessage(message string) conditionInfo { ret := ci if ret.Message == "" { @@ -44,6 +100,8 @@ func (ci conditionInfo) WithMessage(message string) conditionInfo { return ret } +// availableConditionInfo returns a conditionInfo ready to build +// an Available condition func availableConditionInfo() conditionInfo { return conditionInfo{ Type: status.ConditionAvailable, @@ -51,12 +109,16 @@ func availableConditionInfo() conditionInfo { } } +// progressingConditionInfo returns a conditionInfo ready to build +// an Progressing condition func progressingConditionInfo() conditionInfo { return conditionInfo{ Type: status.ConditionProgressing, } } +// degradedConditionInfo returns a conditionInfo ready to build +// an Degraded 
condition with the given error `err`. func degradedConditionInfoFromError(err error) conditionInfo { return conditionInfo{ Type: status.ConditionDegraded, @@ -65,16 +127,6 @@ func degradedConditionInfoFromError(err error) conditionInfo { } } -func fillConditionInfoFromError(cond conditionInfo, err error) conditionInfo { - if cond.Message == "" { - cond.Message = messageFromError(err) - } - if cond.Reason == "" { - cond.Reason = reasonFromError(err) - } - return cond -} - func reasonFromError(err error) string { if err == nil { return "AsExpected" diff --git a/controllers/numaresourcesoperator_controller.go b/controllers/numaresourcesoperator_controller.go index cad5332af..c663f519f 100644 --- a/controllers/numaresourcesoperator_controller.go +++ b/controllers/numaresourcesoperator_controller.go @@ -165,14 +165,14 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr curStatus := instance.Status.DeepCopy() - result, err := r.reconcileResource(ctx, instance, trees) + step := r.reconcileResource(ctx, instance, trees) - if condition == status.ConditionAvailable && multiMCPsErr != nil { - _, _ = r.degradeStatus(ctx, instance, validation.NodeGroupsError, multiMCPsErr) + if step.Done() && multiMCPsErr != nil { + return r.degradeStatus(ctx, instance, validation.NodeGroupsError, multiMCPsErr) } if !status.IsUpdatedNUMAResourcesOperator(curStatus, &instance.Status) { - return result, err + return step.Result, step.Error } updErr := r.Client.Status().Update(ctx, instance) @@ -181,7 +181,7 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr return ctrl.Result{}, fmt.Errorf("could not update status for object %s: %w", client.ObjectKeyFromObject(instance), updErr) } - return result, err + return step.Result, step.Error } // updateStatusConditionsIfNeeded returns true if conditions were updated. 
@@ -216,27 +216,27 @@ func (r *NUMAResourcesOperatorReconciler) degradeStatus(ctx context.Context, ins return ctrl.Result{}, nil } -func (r *NUMAResourcesOperatorReconciler) reconcileResourceAPI(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) (bool, ctrl.Result, conditionInfo, error) { +func (r *NUMAResourcesOperatorReconciler) reconcileResourceAPI(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) reconcileStep { applied, err := r.syncNodeResourceTopologyAPI(ctx) if err != nil { r.Recorder.Eventf(instance, corev1.EventTypeWarning, "FailedCRDInstall", "Failed to install Node Resource Topology CRD: %v", err) err = fmt.Errorf("FailedAPISync: %w", err) - return true, ctrl.Result{}, degradedConditionInfoFromError(err), err + return reconcileStepFailed(err) } if applied { r.Recorder.Eventf(instance, corev1.EventTypeNormal, "SuccessfulCRDInstall", "Node Resource Topology CRD installed") } - return false, ctrl.Result{}, availableConditionInfo(), nil + return reconcileStepSuccess() } -func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) (bool, ctrl.Result, conditionInfo, error) { +func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) reconcileStep { // we need to sync machine configs first and wait for the MachineConfigPool updates // before checking additional components for updates mcpUpdatedFunc, err := r.syncMachineConfigs(ctx, instance, trees) if err != nil { r.Recorder.Eventf(instance, corev1.EventTypeWarning, "FailedMCSync", "Failed to set up machine configuration for worker nodes: %v", err) err = fmt.Errorf("failed to sync machine configs: %w", err) - return true, ctrl.Result{}, degradedConditionInfoFromError(err), err + return reconcileStepFailed(err) } r.Recorder.Eventf(instance, 
corev1.EventTypeNormal, "SuccessfulMCSync", "Enabled machine configuration for worker nodes") @@ -247,22 +247,22 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx con if !allMCPsUpdated { // the Machine Config Pool still did not apply the machine config, wait for one minute - return true, ctrl.Result{RequeueAfter: numaResourcesRetryPeriod}, progressingConditionInfo(), nil + return reconcileStepOngoing(numaResourcesRetryPeriod) } instance.Status.MachineConfigPools = syncMachineConfigPoolNodeGroupConfigStatuses(instance.Status.MachineConfigPools, trees) - return false, ctrl.Result{}, availableConditionInfo(), nil + return reconcileStepSuccess() } -func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) ([]poolDaemonSet, bool, ctrl.Result, conditionInfo, error) { +func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) ([]poolDaemonSet, reconcileStep) { daemonSetsInfoPerMCP, err := r.syncNUMAResourcesOperatorResources(ctx, instance, trees) if err != nil { r.Recorder.Eventf(instance, corev1.EventTypeWarning, "FailedRTECreate", "Failed to create Resource-Topology-Exporter DaemonSets: %v", err) err = fmt.Errorf("FailedRTESync: %w", err) - return nil, true, ctrl.Result{}, degradedConditionInfoFromError(err), err + return nil, reconcileStepFailed(err) } if len(daemonSetsInfoPerMCP) == 0 { - return nil, false, ctrl.Result{}, availableConditionInfo(), nil + return nil, reconcileStepSuccess() } r.Recorder.Eventf(instance, corev1.EventTypeNormal, "SuccessfulRTECreate", "Created Resource-Topology-Exporter DaemonSets") @@ -271,32 +271,32 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context instance.Status.DaemonSets = dssWithReadyStatus instance.Status.RelatedObjects = relatedobjects.ResourceTopologyExporter(r.Namespace, 
dssWithReadyStatus) if err != nil { - return nil, true, ctrl.Result{}, degradedConditionInfoFromError(err), err + return nil, reconcileStepFailed(err) } if !allDSsUpdated { - return nil, true, ctrl.Result{RequeueAfter: 5 * time.Second}, progressingConditionInfo(), nil + return nil, reconcileStepOngoing(5 * time.Second) } - return daemonSetsInfoPerMCP, false, ctrl.Result{}, availableConditionInfo(), nil + return daemonSetsInfoPerMCP, reconcileStepSuccess() } -func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) (ctrl.Result, error) { - if done, res, cond, err := r.reconcileResourceAPI(ctx, instance, trees); done { - updateStatusConditionsIfNeeded(instance, fillConditionInfoFromError(cond, err)) - return res, err +func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) reconcileStep { + if step := r.reconcileResourceAPI(ctx, instance, trees); step.EarlyStop() { + updateStatusConditionsIfNeeded(instance, step.ConditionInfo) + return step } if r.Platform == platform.OpenShift { - if done, res, cond, err := r.reconcileResourceMachineConfig(ctx, instance, trees); done { - updateStatusConditionsIfNeeded(instance, fillConditionInfoFromError(cond, err)) - return res, err + if step := r.reconcileResourceMachineConfig(ctx, instance, trees); step.EarlyStop() { + updateStatusConditionsIfNeeded(instance, step.ConditionInfo) + return step } } - dsPerMCP, done, res, cond, err := r.reconcileResourceDaemonSet(ctx, instance, trees) - if done { - updateStatusConditionsIfNeeded(instance, fillConditionInfoFromError(cond, err)) - return res, err + dsPerMCP, step := r.reconcileResourceDaemonSet(ctx, instance, trees) + if step.EarlyStop() { + updateStatusConditionsIfNeeded(instance, step.ConditionInfo) + return step } // all fields of NodeGroupStatus are required so publish the status only when all daemonset 
and MCPs are updated which @@ -304,7 +304,10 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, instance.Status.NodeGroups = syncNodeGroupsStatus(instance, dsPerMCP) updateStatusConditionsIfNeeded(instance, availableConditionInfo()) - return ctrl.Result{}, nil + return reconcileStep{ + Result: ctrl.Result{}, + ConditionInfo: availableConditionInfo(), + } } func (r *NUMAResourcesOperatorReconciler) syncDaemonSetsStatuses(ctx context.Context, rd client.Reader, daemonSetsInfo []poolDaemonSet) ([]nropv1.NamespacedName, bool, error) { diff --git a/controllers/numaresourcesoperator_controller_test.go b/controllers/numaresourcesoperator_controller_test.go index 41812854d..eb8ed6e9f 100644 --- a/controllers/numaresourcesoperator_controller_test.go +++ b/controllers/numaresourcesoperator_controller_test.go @@ -188,7 +188,7 @@ var _ = Describe("Test NUMAResourcesOperator Reconcile", func() { }) Context("with node group with MCP selector that matches more than one MCP", func() { - It("should update the CR condition to degraded when annotation is not enabled but still creat all needed objects", func() { + It("should update the CR condition to degraded when annotation is not enabled but still create all needed objects", func() { mcpName1 := "test1" label1 := map[string]string{ "test1": "test1", @@ -336,6 +336,7 @@ var _ = Describe("Test NUMAResourcesOperator Reconcile", func() { Expect(availableCondition.Status).To(Equal(metav1.ConditionTrue)) }) }) + Context("with correct NRO and SELinuxPolicyConfigAnnotation not set", func() { It("should create all objects, RTE daemonsets and MCPs will get updated from the first reconcile iteration", func() { mcpName := "test1" From 84a9751ed24fe9b3616dd81eb08f5ac8f34d8dfa Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Tue, 19 Nov 2024 11:25:37 +0100 Subject: [PATCH 08/10] status: controllers: move utilities in most fitting places Instead of adding utilities in `controllers`, which makes harder than it 
can to add unit tests, move the utilities in the existing or new more fitting packages. Trivial code movement with related trivial rename, and enables adding unit tests Signed-off-by: Francesco Romani --- controllers/controllers.go | 146 ------------------ .../numaresourcesoperator_controller.go | 41 ++--- .../numaresourcesscheduler_controller.go | 2 +- internal/reconcile/step.go | 74 +++++++++ pkg/status/conditioninfo/conditioninfo.go | 75 +++++++++ pkg/status/status.go | 25 +++ 6 files changed, 197 insertions(+), 166 deletions(-) delete mode 100644 controllers/controllers.go create mode 100644 internal/reconcile/step.go create mode 100644 pkg/status/conditioninfo/conditioninfo.go diff --git a/controllers/controllers.go b/controllers/controllers.go deleted file mode 100644 index 4c649ddd8..000000000 --- a/controllers/controllers.go +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright 2021 Red Hat, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package controllers - -import ( - "errors" - "time" - - ctrl "sigs.k8s.io/controller-runtime" - - "github.com/openshift-kni/numaresources-operator/pkg/status" -) - -type reconcileStep struct { - Result ctrl.Result - ConditionInfo conditionInfo - Error error -} - -// Done returns true if a reconciliation step is fully completed, -// meaning all the substeps have been performed successfully, false otherwise -func (rs reconcileStep) Done() bool { - return rs.ConditionInfo.Type == status.ConditionAvailable -} - -// EarlyStop returns true if we should shortcut after a reconcile substep, -// false otherwise -func (rs reconcileStep) EarlyStop() bool { - return rs.ConditionInfo.Type != status.ConditionAvailable -} - -// reconcileStepSuccess returns a reconcileStep which tells the caller -// the reconciliation attempt was completed successfully -func reconcileStepSuccess() reconcileStep { - return reconcileStep{ - Result: ctrl.Result{}, - ConditionInfo: availableConditionInfo(), - } -} - -// reconcileStepOngoing returns a reconcileStep which tells the caller -// the reconciliation attempt needs to wait for external condition (e.g. -// MachineConfig updates and to retry to reconcile after the `after` value. -func reconcileStepOngoing(after time.Duration) reconcileStep { - return reconcileStep{ - Result: ctrl.Result{RequeueAfter: after}, - ConditionInfo: progressingConditionInfo(), - } -} - -// reconcileStepFailed returns a reconcileStep which tells the caller -// the reconciliation attempt failed with the given error `err`, and -// that the NUMAResourcesOperator condition should be Degraded. 
-func reconcileStepFailed(err error) reconcileStep { - return reconcileStep{ - Result: ctrl.Result{}, - ConditionInfo: degradedConditionInfoFromError(err), - Error: err, - } -} - -// conditionInfo holds all the data needed to build a Condition -type conditionInfo struct { - Type string // like metav1.Condition.Type - Reason string - Message string -} - -// WithReason override the conditionInfo reason with the given value, -// and returns a new updated conditionInfo -func (ci conditionInfo) WithReason(reason string) conditionInfo { - ret := ci - if ret.Reason == "" { - ret.Reason = reason - } - return ret -} - -// WithMessage override the conditionInfo message with the given value, -// and returns a new updated conditionInfo -func (ci conditionInfo) WithMessage(message string) conditionInfo { - ret := ci - if ret.Message == "" { - ret.Message = message - } - return ret -} - -// availableConditionInfo returns a conditionInfo ready to build -// an Available condition -func availableConditionInfo() conditionInfo { - return conditionInfo{ - Type: status.ConditionAvailable, - Reason: "AsExpected", - } -} - -// progressingConditionInfo returns a conditionInfo ready to build -// an Progressing condition -func progressingConditionInfo() conditionInfo { - return conditionInfo{ - Type: status.ConditionProgressing, - } -} - -// degradedConditionInfo returns a conditionInfo ready to build -// an Degraded condition with the given error `err`. 
-func degradedConditionInfoFromError(err error) conditionInfo { - return conditionInfo{ - Type: status.ConditionDegraded, - Message: messageFromError(err), - Reason: reasonFromError(err), - } -} - -func reasonFromError(err error) string { - if err == nil { - return "AsExpected" - } - return "InternalError" -} - -func messageFromError(err error) string { - if err == nil { - return "" - } - unwErr := errors.Unwrap(err) - if unwErr == nil { - return "" - } - return unwErr.Error() -} diff --git a/controllers/numaresourcesoperator_controller.go b/controllers/numaresourcesoperator_controller.go index c663f519f..31ac4228b 100644 --- a/controllers/numaresourcesoperator_controller.go +++ b/controllers/numaresourcesoperator_controller.go @@ -66,7 +66,10 @@ import ( rtestate "github.com/openshift-kni/numaresources-operator/pkg/objectstate/rte" rteupdate "github.com/openshift-kni/numaresources-operator/pkg/objectupdate/rte" "github.com/openshift-kni/numaresources-operator/pkg/status" + "github.com/openshift-kni/numaresources-operator/pkg/status/conditioninfo" "github.com/openshift-kni/numaresources-operator/pkg/validation" + + intreconcile "github.com/openshift-kni/numaresources-operator/internal/reconcile" ) const numaResourcesRetryPeriod = 1 * time.Minute @@ -185,7 +188,7 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr } // updateStatusConditionsIfNeeded returns true if conditions were updated. 
-func updateStatusConditionsIfNeeded(instance *nropv1.NUMAResourcesOperator, cond conditionInfo) { +func updateStatusConditionsIfNeeded(instance *nropv1.NUMAResourcesOperator, cond conditioninfo.ConditionInfo) { if cond.Type == "" { // backward (=legacy) compatibility return } @@ -197,7 +200,7 @@ func updateStatusConditionsIfNeeded(instance *nropv1.NUMAResourcesOperator, cond } func (r *NUMAResourcesOperatorReconciler) degradeStatus(ctx context.Context, instance *nropv1.NUMAResourcesOperator, reason string, stErr error) (ctrl.Result, error) { - info := degradedConditionInfoFromError(stErr) + info := conditioninfo.DegradedFromError(stErr) if reason != "" { // intentionally overwrite info.Reason = reason } @@ -216,27 +219,27 @@ func (r *NUMAResourcesOperatorReconciler) degradeStatus(ctx context.Context, ins return ctrl.Result{}, nil } -func (r *NUMAResourcesOperatorReconciler) reconcileResourceAPI(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) reconcileStep { +func (r *NUMAResourcesOperatorReconciler) reconcileResourceAPI(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) intreconcile.Step { applied, err := r.syncNodeResourceTopologyAPI(ctx) if err != nil { r.Recorder.Eventf(instance, corev1.EventTypeWarning, "FailedCRDInstall", "Failed to install Node Resource Topology CRD: %v", err) err = fmt.Errorf("FailedAPISync: %w", err) - return reconcileStepFailed(err) + return intreconcile.StepFailed(err) } if applied { r.Recorder.Eventf(instance, corev1.EventTypeNormal, "SuccessfulCRDInstall", "Node Resource Topology CRD installed") } - return reconcileStepSuccess() + return intreconcile.StepSuccess() } -func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) reconcileStep { +func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx context.Context, instance 
*nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) intreconcile.Step { // we need to sync machine configs first and wait for the MachineConfigPool updates // before checking additional components for updates mcpUpdatedFunc, err := r.syncMachineConfigs(ctx, instance, trees) if err != nil { r.Recorder.Eventf(instance, corev1.EventTypeWarning, "FailedMCSync", "Failed to set up machine configuration for worker nodes: %v", err) err = fmt.Errorf("failed to sync machine configs: %w", err) - return reconcileStepFailed(err) + return intreconcile.StepFailed(err) } r.Recorder.Eventf(instance, corev1.EventTypeNormal, "SuccessfulMCSync", "Enabled machine configuration for worker nodes") @@ -247,22 +250,22 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx con if !allMCPsUpdated { // the Machine Config Pool still did not apply the machine config, wait for one minute - return reconcileStepOngoing(numaResourcesRetryPeriod) + return intreconcile.StepOngoing(numaResourcesRetryPeriod) } instance.Status.MachineConfigPools = syncMachineConfigPoolNodeGroupConfigStatuses(instance.Status.MachineConfigPools, trees) - return reconcileStepSuccess() + return intreconcile.StepSuccess() } -func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) ([]poolDaemonSet, reconcileStep) { +func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) ([]poolDaemonSet, intreconcile.Step) { daemonSetsInfoPerMCP, err := r.syncNUMAResourcesOperatorResources(ctx, instance, trees) if err != nil { r.Recorder.Eventf(instance, corev1.EventTypeWarning, "FailedRTECreate", "Failed to create Resource-Topology-Exporter DaemonSets: %v", err) err = fmt.Errorf("FailedRTESync: %w", err) - return nil, reconcileStepFailed(err) + return nil, intreconcile.StepFailed(err) } if len(daemonSetsInfoPerMCP) 
== 0 { - return nil, reconcileStepSuccess() + return nil, intreconcile.StepSuccess() } r.Recorder.Eventf(instance, corev1.EventTypeNormal, "SuccessfulRTECreate", "Created Resource-Topology-Exporter DaemonSets") @@ -271,16 +274,16 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context instance.Status.DaemonSets = dssWithReadyStatus instance.Status.RelatedObjects = relatedobjects.ResourceTopologyExporter(r.Namespace, dssWithReadyStatus) if err != nil { - return nil, reconcileStepFailed(err) + return nil, intreconcile.StepFailed(err) } if !allDSsUpdated { - return nil, reconcileStepOngoing(5 * time.Second) + return nil, intreconcile.StepOngoing(5 * time.Second) } - return daemonSetsInfoPerMCP, reconcileStepSuccess() + return daemonSetsInfoPerMCP, intreconcile.StepSuccess() } -func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) reconcileStep { +func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) intreconcile.Step { if step := r.reconcileResourceAPI(ctx, instance, trees); step.EarlyStop() { updateStatusConditionsIfNeeded(instance, step.ConditionInfo) return step @@ -303,10 +306,10 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, // is a certain thing if we got to this point otherwise the function would have returned already instance.Status.NodeGroups = syncNodeGroupsStatus(instance, dsPerMCP) - updateStatusConditionsIfNeeded(instance, availableConditionInfo()) - return reconcileStep{ + updateStatusConditionsIfNeeded(instance, conditioninfo.Available()) + return intreconcile.Step{ Result: ctrl.Result{}, - ConditionInfo: availableConditionInfo(), + ConditionInfo: conditioninfo.Available(), } } diff --git a/controllers/numaresourcesscheduler_controller.go b/controllers/numaresourcesscheduler_controller.go index 
14d2bebd3..df53d43c7 100644 --- a/controllers/numaresourcesscheduler_controller.go +++ b/controllers/numaresourcesscheduler_controller.go @@ -111,7 +111,7 @@ func (r *NUMAResourcesSchedulerReconciler) Reconcile(ctx context.Context, req ct } result, condition, err := r.reconcileResource(ctx, instance) - if err := r.updateStatus(ctx, instance, condition, reasonFromError(err), messageFromError(err)); err != nil { + if err := r.updateStatus(ctx, instance, condition, status.ReasonFromError(err), status.MessageFromError(err)); err != nil { klog.InfoS("Failed to update numaresourcesscheduler status", "Desired condition", condition, "error", err) } diff --git a/internal/reconcile/step.go b/internal/reconcile/step.go new file mode 100644 index 000000000..65fa60862 --- /dev/null +++ b/internal/reconcile/step.go @@ -0,0 +1,74 @@ +/* + * Copyright 2024 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package reconcile + +import ( + "time" + + ctrl "sigs.k8s.io/controller-runtime" + + "github.com/openshift-kni/numaresources-operator/pkg/status" + "github.com/openshift-kni/numaresources-operator/pkg/status/conditioninfo" +) + +type Step struct { + Result ctrl.Result + ConditionInfo conditioninfo.ConditionInfo + Error error +} + +// Done returns true if a reconciliation step is fully completed, +// meaning all the substeps have been performed successfully, false otherwise +func (rs Step) Done() bool { + return rs.ConditionInfo.Type == status.ConditionAvailable +} + +// EarlyStop returns true if we should shortcut after a reconcile substep, +// false otherwise +func (rs Step) EarlyStop() bool { + return rs.ConditionInfo.Type != status.ConditionAvailable +} + +// StepSuccess returns a Step which tells the caller +// the reconciliation attempt was completed successfully +func StepSuccess() Step { + return Step{ + Result: ctrl.Result{}, + ConditionInfo: conditioninfo.Available(), + } +} + +// StepOngoing returns a Step which tells the caller +// the reconciliation attempt needs to wait for external condition (e.g. +// MachineConfig updates and to retry to reconcile after the `after` value. +func StepOngoing(after time.Duration) Step { + return Step{ + Result: ctrl.Result{RequeueAfter: after}, + ConditionInfo: conditioninfo.Progressing(), + } +} + +// StepFailed returns a Step which tells the caller +// the reconciliation attempt failed with the given error `err`, and +// that the NUMAResourcesOperator condition should be Degraded. +func StepFailed(err error) Step { + return Step{ + Result: ctrl.Result{}, + ConditionInfo: conditioninfo.DegradedFromError(err), + Error: err, + } +} diff --git a/pkg/status/conditioninfo/conditioninfo.go b/pkg/status/conditioninfo/conditioninfo.go new file mode 100644 index 000000000..cd0d85fae --- /dev/null +++ b/pkg/status/conditioninfo/conditioninfo.go @@ -0,0 +1,75 @@ +/* + * Copyright 2024 Red Hat, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package conditioninfo + +import ( + "github.com/openshift-kni/numaresources-operator/pkg/status" +) + +// ConditionInfo holds all the data needed to build a Condition +type ConditionInfo struct { + Type string // like metav1.Condition.Type + Reason string + Message string +} + +// WithReason override the ConditionInfo reason with the given value, +// and returns a new updated ConditionInfo +func (ci ConditionInfo) WithReason(reason string) ConditionInfo { + ret := ci + if ret.Reason == "" { + ret.Reason = reason + } + return ret +} + +// WithMessage override the ConditionInfo message with the given value, +// and returns a new updated ConditionInfo +func (ci ConditionInfo) WithMessage(message string) ConditionInfo { + ret := ci + if ret.Message == "" { + ret.Message = message + } + return ret +} + +// Available returns a ConditionInfo ready to build +// an Available condition +func Available() ConditionInfo { + return ConditionInfo{ + Type: status.ConditionAvailable, + Reason: status.ReasonAsExpected, + } +} + +// Progressing returns a ConditionInfo ready to build +// an Progressing condition +func Progressing() ConditionInfo { + return ConditionInfo{ + Type: status.ConditionProgressing, + } +} + +// Degraded returns a ConditionInfo ready to build +// an Degraded condition with the given error `err`. 
+func DegradedFromError(err error) ConditionInfo { + return ConditionInfo{ + Type: status.ConditionDegraded, + Message: status.MessageFromError(err), + Reason: status.ReasonFromError(err), + } +} diff --git a/pkg/status/status.go b/pkg/status/status.go index 3ac16f359..a56004c4a 100644 --- a/pkg/status/status.go +++ b/pkg/status/status.go @@ -17,6 +17,7 @@ limitations under the License. package status import ( + "errors" "time" "github.com/google/go-cmp/cmp" @@ -35,6 +36,12 @@ const ( ConditionUpgradeable = "Upgradeable" ) +// TODO: are we duping these? +const ( + ReasonAsExpected = "AsExpected" + ReasonInternalError = "InternalError" +) + const ( ConditionTypeIncorrectNUMAResourcesOperatorResourceName = "IncorrectNUMAResourcesOperatorResourceName" ) @@ -130,3 +137,21 @@ type ErrResourcesNotReady struct { func (e ErrResourcesNotReady) Error() string { return e.Message } + +func ReasonFromError(err error) string { + if err == nil { + return ReasonAsExpected + } + return ReasonInternalError +} + +func MessageFromError(err error) string { + if err == nil { + return "" + } + unwErr := errors.Unwrap(err) + if unwErr == nil { + return "" + } + return unwErr.Error() +} From 6e5875d7b01555b75252350fad5ccc0e8d62acfd Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Tue, 19 Nov 2024 11:27:55 +0100 Subject: [PATCH 09/10] status: always get message from errors if we fed status.MessageFromError with a simple, unwrapping error, we mistakenly get a empty string as message, instead of the text representation of the given error. This is both surprising and wrong. This patch fixes that. 
Signed-off-by: Francesco Romani 
---
 pkg/status/status.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/status/status.go b/pkg/status/status.go
index a56004c4a..37637d268 100644
--- a/pkg/status/status.go
+++ b/pkg/status/status.go
@@ -151,7 +151,7 @@ func MessageFromError(err error) string {
 	}
 	unwErr := errors.Unwrap(err)
 	if unwErr == nil {
-		return ""
+		return err.Error()
 	}
 	return unwErr.Error()
 }

From df43a586eae2e09f3c057b7ccd3507035e671b4f Mon Sep 17 00:00:00 2001
From: Francesco Romani 
Date: Tue, 19 Nov 2024 11:28:55 +0100
Subject: [PATCH 10/10] status: add MessageFromError coverage

start testcase coverage for the newly moved functions

Signed-off-by: Francesco Romani 
---
 pkg/status/status_test.go | 42 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/pkg/status/status_test.go b/pkg/status/status_test.go
index 2da592168..ff750a023 100644
--- a/pkg/status/status_test.go
+++ b/pkg/status/status_test.go
@@ -18,6 +18,8 @@ package status
 
 import (
 	"context"
+	"errors"
+	"fmt"
 	"testing"
 	"time"
 
@@ -159,3 +161,43 @@ func TestIsUpdatedNUMAResourcesOperator(t *testing.T) {
 		})
 	}
 }
+
+func TestMessageFromError(t *testing.T) {
+	type testCase struct {
+		name     string
+		err      error
+		expected string
+	}
+
+	testCases := []testCase{
+		{
+			name:     "nil error",
+			expected: "",
+		},
+		{
+			name:     "simple error",
+			err:      errors.New("testing error with simple message"),
+			expected: "testing error with simple message",
+		},
+		{
+			name:     "simple formatted error",
+			err:      fmt.Errorf("testing error message=%s val=%d", "complex", 42),
+			expected: "testing error message=complex val=42",
+		},
+
+		{
+			name:     "wrapped error",
+			err:      fmt.Errorf("outer error: %w", errors.New("inner error")),
+			expected: "inner error",
+		},
+	}
+
+	for _, tcase := range testCases {
+		t.Run(tcase.name, func(t *testing.T) {
+			got := MessageFromError(tcase.err)
+			if got != tcase.expected {
+				t.Errorf("failure getting message from error: got=%q expected=%q", got, 
tcase.expected) + } + }) + } +}