Skip to content

Commit

Permalink
Merge pull request #1032 from openshift-kni/cleanup-status-computation
Browse files Browse the repository at this point in the history
cleanup and streamline status computation
  • Loading branch information
ffromani authored Nov 20, 2024
2 parents 15c8a8e + df43a58 commit 8d3b2ad
Show file tree
Hide file tree
Showing 8 changed files with 381 additions and 80 deletions.
37 changes: 0 additions & 37 deletions controllers/controllers.go

This file was deleted.

111 changes: 70 additions & 41 deletions controllers/numaresourcesoperator_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,10 @@ import (
rtestate "github.com/openshift-kni/numaresources-operator/pkg/objectstate/rte"
rteupdate "github.com/openshift-kni/numaresources-operator/pkg/objectupdate/rte"
"github.com/openshift-kni/numaresources-operator/pkg/status"
"github.com/openshift-kni/numaresources-operator/pkg/status/conditioninfo"
"github.com/openshift-kni/numaresources-operator/pkg/validation"

intreconcile "github.com/openshift-kni/numaresources-operator/internal/reconcile"
)

const numaResourcesRetryPeriod = 1 * time.Minute
Expand Down Expand Up @@ -140,54 +143,70 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr

if req.Name != objectnames.DefaultNUMAResourcesOperatorCrName {
err := fmt.Errorf("incorrect NUMAResourcesOperator resource name: %s", instance.Name)
return r.updateStatus(ctx, instance, status.ConditionDegraded, status.ConditionTypeIncorrectNUMAResourcesOperatorResourceName, err)
return r.degradeStatus(ctx, instance, status.ConditionTypeIncorrectNUMAResourcesOperatorResourceName, err)
}

if err := validation.NodeGroups(instance.Spec.NodeGroups); err != nil {
return r.updateStatus(ctx, instance, status.ConditionDegraded, validation.NodeGroupsError, err)
return r.degradeStatus(ctx, instance, validation.NodeGroupsError, err)
}

trees, err := getTreesByNodeGroup(ctx, r.Client, instance.Spec.NodeGroups)
if err != nil {
return r.updateStatus(ctx, instance, status.ConditionDegraded, validation.NodeGroupsError, err)
return r.degradeStatus(ctx, instance, validation.NodeGroupsError, err)
}

multiMCPsErr := validation.MultipleMCPsPerTree(instance.Annotations, trees)

if err := validation.MachineConfigPoolDuplicates(trees); err != nil {
return r.updateStatus(ctx, instance, status.ConditionDegraded, validation.NodeGroupsError, err)
return r.degradeStatus(ctx, instance, validation.NodeGroupsError, err)
}

for idx := range trees {
conf := trees[idx].NodeGroup.NormalizeConfig()
trees[idx].NodeGroup.Config = &conf
}

result, condition, err := r.reconcileResource(ctx, instance, trees)
if condition != "" {
if condition == status.ConditionAvailable && multiMCPsErr != nil {
_, _ = r.updateStatus(ctx, instance, status.ConditionDegraded, validation.NodeGroupsError, multiMCPsErr)
} else {
_, _ = r.updateStatus(ctx, instance, condition, reasonFromError(err), err)
}
curStatus := instance.Status.DeepCopy()

step := r.reconcileResource(ctx, instance, trees)

if step.Done() && multiMCPsErr != nil {
return r.degradeStatus(ctx, instance, validation.NodeGroupsError, multiMCPsErr)
}

if !status.IsUpdatedNUMAResourcesOperator(curStatus, &instance.Status) {
return step.Result, step.Error
}
return result, err

updErr := r.Client.Status().Update(ctx, instance)
if updErr != nil {
klog.InfoS("Failed to update numaresourcesoperator status", "error", updErr)
return ctrl.Result{}, fmt.Errorf("could not update status for object %s: %w", client.ObjectKeyFromObject(instance), updErr)
}

return step.Result, step.Error
}

// updateStatusConditionsIfNeeded returns true if conditions were updated.
func updateStatusConditionsIfNeeded(instance *nropv1.NUMAResourcesOperator, condition string, reason string, message string) bool {
klog.InfoS("updateStatus", "condition", condition, "reason", reason, "message", message)
conditions, ok := status.UpdateConditions(instance.Status.Conditions, condition, reason, message)
func updateStatusConditionsIfNeeded(instance *nropv1.NUMAResourcesOperator, cond conditioninfo.ConditionInfo) {
if cond.Type == "" { // backward (=legacy) compatibility
return
}
klog.InfoS("updateStatus", "condition", cond.Type, "reason", cond.Reason, "message", cond.Message)
conditions, ok := status.UpdateConditions(instance.Status.Conditions, cond.Type, cond.Reason, cond.Message)
if ok {
instance.Status.Conditions = conditions
}
return ok
}

func (r *NUMAResourcesOperatorReconciler) updateStatus(ctx context.Context, instance *nropv1.NUMAResourcesOperator, condition string, reason string, stErr error) (ctrl.Result, error) {
message := messageFromError(stErr)
func (r *NUMAResourcesOperatorReconciler) degradeStatus(ctx context.Context, instance *nropv1.NUMAResourcesOperator, reason string, stErr error) (ctrl.Result, error) {
info := conditioninfo.DegradedFromError(stErr)
if reason != "" { // intentionally overwrite
info.Reason = reason
}

_ = updateStatusConditionsIfNeeded(instance, condition, reason, message)
updateStatusConditionsIfNeeded(instance, info)
// TODO: if we keep being degraded, we likely (= if we don't, it's too implicit) keep sending possibly redundant updates to the apiserver

err := r.Client.Status().Update(ctx, instance)
if err != nil {
Expand All @@ -200,25 +219,27 @@ func (r *NUMAResourcesOperatorReconciler) updateStatus(ctx context.Context, inst
return ctrl.Result{}, nil
}

func (r *NUMAResourcesOperatorReconciler) reconcileResourceAPI(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) (bool, ctrl.Result, string, error) {
func (r *NUMAResourcesOperatorReconciler) reconcileResourceAPI(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) intreconcile.Step {
applied, err := r.syncNodeResourceTopologyAPI(ctx)
if err != nil {
r.Recorder.Eventf(instance, corev1.EventTypeWarning, "FailedCRDInstall", "Failed to install Node Resource Topology CRD: %v", err)
return true, ctrl.Result{}, status.ConditionDegraded, fmt.Errorf("FailedAPISync: %w", err)
err = fmt.Errorf("FailedAPISync: %w", err)
return intreconcile.StepFailed(err)
}
if applied {
r.Recorder.Eventf(instance, corev1.EventTypeNormal, "SuccessfulCRDInstall", "Node Resource Topology CRD installed")
}
return false, ctrl.Result{}, "", nil
return intreconcile.StepSuccess()
}

func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) (bool, ctrl.Result, string, error) {
func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) intreconcile.Step {
// we need to sync machine configs first and wait for the MachineConfigPool updates
// before checking additional components for updates
mcpUpdatedFunc, err := r.syncMachineConfigs(ctx, instance, trees)
if err != nil {
r.Recorder.Eventf(instance, corev1.EventTypeWarning, "FailedMCSync", "Failed to set up machine configuration for worker nodes: %v", err)
return true, ctrl.Result{}, status.ConditionDegraded, fmt.Errorf("failed to sync machine configs: %w", err)
err = fmt.Errorf("failed to sync machine configs: %w", err)
return intreconcile.StepFailed(err)
}
r.Recorder.Eventf(instance, corev1.EventTypeNormal, "SuccessfulMCSync", "Enabled machine configuration for worker nodes")

Expand All @@ -229,21 +250,22 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx con

if !allMCPsUpdated {
// the Machine Config Pool still did not apply the machine config, wait for one minute
return true, ctrl.Result{RequeueAfter: numaResourcesRetryPeriod}, status.ConditionProgressing, nil
return intreconcile.StepOngoing(numaResourcesRetryPeriod)
}
instance.Status.MachineConfigPools = syncMachineConfigPoolNodeGroupConfigStatuses(instance.Status.MachineConfigPools, trees)

return false, ctrl.Result{}, "", nil
return intreconcile.StepSuccess()
}

func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) ([]poolDaemonSet, bool, ctrl.Result, string, error) {
func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) ([]poolDaemonSet, intreconcile.Step) {
daemonSetsInfoPerMCP, err := r.syncNUMAResourcesOperatorResources(ctx, instance, trees)
if err != nil {
r.Recorder.Eventf(instance, corev1.EventTypeWarning, "FailedRTECreate", "Failed to create Resource-Topology-Exporter DaemonSets: %v", err)
return nil, true, ctrl.Result{}, status.ConditionDegraded, fmt.Errorf("FailedRTESync: %w", err)
err = fmt.Errorf("FailedRTESync: %w", err)
return nil, intreconcile.StepFailed(err)
}
if len(daemonSetsInfoPerMCP) == 0 {
return nil, false, ctrl.Result{}, "", nil
return nil, intreconcile.StepSuccess()
}

r.Recorder.Eventf(instance, corev1.EventTypeNormal, "SuccessfulRTECreate", "Created Resource-Topology-Exporter DaemonSets")
Expand All @@ -252,36 +274,43 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context
instance.Status.DaemonSets = dssWithReadyStatus
instance.Status.RelatedObjects = relatedobjects.ResourceTopologyExporter(r.Namespace, dssWithReadyStatus)
if err != nil {
return nil, true, ctrl.Result{}, status.ConditionDegraded, err
return nil, intreconcile.StepFailed(err)
}
if !allDSsUpdated {
return nil, true, ctrl.Result{RequeueAfter: 5 * time.Second}, status.ConditionProgressing, nil
return nil, intreconcile.StepOngoing(5 * time.Second)
}

return daemonSetsInfoPerMCP, false, ctrl.Result{}, "", nil
return daemonSetsInfoPerMCP, intreconcile.StepSuccess()
}

func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) (ctrl.Result, string, error) {
if done, res, cond, err := r.reconcileResourceAPI(ctx, instance, trees); done {
return res, cond, err
func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) intreconcile.Step {
if step := r.reconcileResourceAPI(ctx, instance, trees); step.EarlyStop() {
updateStatusConditionsIfNeeded(instance, step.ConditionInfo)
return step
}

if r.Platform == platform.OpenShift {
if done, res, cond, err := r.reconcileResourceMachineConfig(ctx, instance, trees); done {
return res, cond, err
if step := r.reconcileResourceMachineConfig(ctx, instance, trees); step.EarlyStop() {
updateStatusConditionsIfNeeded(instance, step.ConditionInfo)
return step
}
}

dsPerMCP, done, res, cond, err := r.reconcileResourceDaemonSet(ctx, instance, trees)
if done {
return res, cond, err
dsPerMCP, step := r.reconcileResourceDaemonSet(ctx, instance, trees)
if step.EarlyStop() {
updateStatusConditionsIfNeeded(instance, step.ConditionInfo)
return step
}

// all fields of NodeGroupStatus are required so publish the status only when all daemonset and MCPs are updated which
// is a certain thing if we got to this point otherwise the function would have returned already
instance.Status.NodeGroups = syncNodeGroupsStatus(instance, dsPerMCP)

return ctrl.Result{}, status.ConditionAvailable, nil
updateStatusConditionsIfNeeded(instance, conditioninfo.Available())
return intreconcile.Step{
Result: ctrl.Result{},
ConditionInfo: conditioninfo.Available(),
}
}

func (r *NUMAResourcesOperatorReconciler) syncDaemonSetsStatuses(ctx context.Context, rd client.Reader, daemonSetsInfo []poolDaemonSet) ([]nropv1.NamespacedName, bool, error) {
Expand Down
3 changes: 2 additions & 1 deletion controllers/numaresourcesoperator_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ var _ = Describe("Test NUMAResourcesOperator Reconcile", func() {
})

Context("with node group with MCP selector that matches more than one MCP", func() {
It("should update the CR condition to degraded when annotation is not enabled but still creat all needed objects", func() {
It("should update the CR condition to degraded when annotation is not enabled but still create all needed objects", func() {
mcpName1 := "test1"
label1 := map[string]string{
"test1": "test1",
Expand Down Expand Up @@ -335,6 +335,7 @@ var _ = Describe("Test NUMAResourcesOperator Reconcile", func() {
Expect(availableCondition.Status).To(Equal(metav1.ConditionTrue))
})
})

Context("with correct NRO and SELinuxPolicyConfigAnnotation not set", func() {
It("should create all objects, RTE daemonsets and MCPs will get updated from the first reconcile iteration", func() {
mcpName := "test1"
Expand Down
2 changes: 1 addition & 1 deletion controllers/numaresourcesscheduler_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ func (r *NUMAResourcesSchedulerReconciler) Reconcile(ctx context.Context, req ct
}

result, condition, err := r.reconcileResource(ctx, instance)
if err := r.updateStatus(ctx, instance, condition, reasonFromError(err), messageFromError(err)); err != nil {
if err := r.updateStatus(ctx, instance, condition, status.ReasonFromError(err), status.MessageFromError(err)); err != nil {
klog.InfoS("Failed to update numaresourcesscheduler status", "Desired condition", condition, "error", err)
}

Expand Down
74 changes: 74 additions & 0 deletions internal/reconcile/step.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* Copyright 2024 Red Hat, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package reconcile

import (
"time"

ctrl "sigs.k8s.io/controller-runtime"

"github.com/openshift-kni/numaresources-operator/pkg/status"
"github.com/openshift-kni/numaresources-operator/pkg/status/conditioninfo"
)

type Step struct {
Result ctrl.Result
ConditionInfo conditioninfo.ConditionInfo
Error error
}

// Done returns true if a reconciliation step is fully completed,
// meaning all the substeps have been performed successfully, false otherwise
func (rs Step) Done() bool {
return rs.ConditionInfo.Type == status.ConditionAvailable
}

// EarlyStop returns true if we should shortcut after a reconcile substep,
// false otherwise
func (rs Step) EarlyStop() bool {
return rs.ConditionInfo.Type != status.ConditionAvailable
}

// StepSuccess returns a Step which tells the caller
// the reconciliation attempt was completed successfully
func StepSuccess() Step {
return Step{
Result: ctrl.Result{},
ConditionInfo: conditioninfo.Available(),
}
}

// StepOngoing returns a Step which tells the caller
// the reconciliation attempt needs to wait for external condition (e.g.
// MachineConfig updates and to retry to reconcile after the `after` value.
func StepOngoing(after time.Duration) Step {
return Step{
Result: ctrl.Result{RequeueAfter: after},
ConditionInfo: conditioninfo.Progressing(),
}
}

// StepFailed returns a Step which tells the caller
// the reconciliation attempt failed with the given error `err`, and
// that the NUMAResourcesOperator condition should be Degraded.
func StepFailed(err error) Step {
return Step{
Result: ctrl.Result{},
ConditionInfo: conditioninfo.DegradedFromError(err),
Error: err,
}
}
Loading

0 comments on commit 8d3b2ad

Please sign in to comment.