Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cleanup and streamline status computation #1032

Merged
merged 10 commits into from
Nov 20, 2024
37 changes: 0 additions & 37 deletions controllers/controllers.go

This file was deleted.

111 changes: 70 additions & 41 deletions controllers/numaresourcesoperator_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,10 @@ import (
rtestate "github.com/openshift-kni/numaresources-operator/pkg/objectstate/rte"
rteupdate "github.com/openshift-kni/numaresources-operator/pkg/objectupdate/rte"
"github.com/openshift-kni/numaresources-operator/pkg/status"
"github.com/openshift-kni/numaresources-operator/pkg/status/conditioninfo"
"github.com/openshift-kni/numaresources-operator/pkg/validation"

intreconcile "github.com/openshift-kni/numaresources-operator/internal/reconcile"
)

const numaResourcesRetryPeriod = 1 * time.Minute
Expand Down Expand Up @@ -140,54 +143,70 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr

if req.Name != objectnames.DefaultNUMAResourcesOperatorCrName {
err := fmt.Errorf("incorrect NUMAResourcesOperator resource name: %s", instance.Name)
return r.updateStatus(ctx, instance, status.ConditionDegraded, status.ConditionTypeIncorrectNUMAResourcesOperatorResourceName, err)
return r.degradeStatus(ctx, instance, status.ConditionTypeIncorrectNUMAResourcesOperatorResourceName, err)
}

if err := validation.NodeGroups(instance.Spec.NodeGroups); err != nil {
return r.updateStatus(ctx, instance, status.ConditionDegraded, validation.NodeGroupsError, err)
return r.degradeStatus(ctx, instance, validation.NodeGroupsError, err)
}

trees, err := getTreesByNodeGroup(ctx, r.Client, instance.Spec.NodeGroups)
if err != nil {
return r.updateStatus(ctx, instance, status.ConditionDegraded, validation.NodeGroupsError, err)
return r.degradeStatus(ctx, instance, validation.NodeGroupsError, err)
}

multiMCPsErr := validation.MultipleMCPsPerTree(instance.Annotations, trees)

if err := validation.MachineConfigPoolDuplicates(trees); err != nil {
return r.updateStatus(ctx, instance, status.ConditionDegraded, validation.NodeGroupsError, err)
return r.degradeStatus(ctx, instance, validation.NodeGroupsError, err)
}

for idx := range trees {
conf := trees[idx].NodeGroup.NormalizeConfig()
trees[idx].NodeGroup.Config = &conf
}

result, condition, err := r.reconcileResource(ctx, instance, trees)
if condition != "" {
if condition == status.ConditionAvailable && multiMCPsErr != nil {
_, _ = r.updateStatus(ctx, instance, status.ConditionDegraded, validation.NodeGroupsError, multiMCPsErr)
} else {
_, _ = r.updateStatus(ctx, instance, condition, reasonFromError(err), err)
}
curStatus := instance.Status.DeepCopy()

step := r.reconcileResource(ctx, instance, trees)

if step.Done() && multiMCPsErr != nil {
return r.degradeStatus(ctx, instance, validation.NodeGroupsError, multiMCPsErr)
}

if !status.IsUpdatedNUMAResourcesOperator(curStatus, &instance.Status) {
return step.Result, step.Error
}
return result, err

updErr := r.Client.Status().Update(ctx, instance)
if updErr != nil {
shajmakh marked this conversation as resolved.
Show resolved Hide resolved
klog.InfoS("Failed to update numaresourcesoperator status", "error", updErr)
return ctrl.Result{}, fmt.Errorf("could not update status for object %s: %w", client.ObjectKeyFromObject(instance), updErr)
}

return step.Result, step.Error
}

// updateStatusConditionsIfNeeded returns true if conditions were updated.
func updateStatusConditionsIfNeeded(instance *nropv1.NUMAResourcesOperator, condition string, reason string, message string) bool {
klog.InfoS("updateStatus", "condition", condition, "reason", reason, "message", message)
conditions, ok := status.UpdateConditions(instance.Status.Conditions, condition, reason, message)
func updateStatusConditionsIfNeeded(instance *nropv1.NUMAResourcesOperator, cond conditioninfo.ConditionInfo) {
if cond.Type == "" { // backward (=legacy) compatibility
return
}
klog.InfoS("updateStatus", "condition", cond.Type, "reason", cond.Reason, "message", cond.Message)
conditions, ok := status.UpdateConditions(instance.Status.Conditions, cond.Type, cond.Reason, cond.Message)
if ok {
instance.Status.Conditions = conditions
}
return ok
}

func (r *NUMAResourcesOperatorReconciler) updateStatus(ctx context.Context, instance *nropv1.NUMAResourcesOperator, condition string, reason string, stErr error) (ctrl.Result, error) {
message := messageFromError(stErr)
func (r *NUMAResourcesOperatorReconciler) degradeStatus(ctx context.Context, instance *nropv1.NUMAResourcesOperator, reason string, stErr error) (ctrl.Result, error) {
info := conditioninfo.DegradedFromError(stErr)
if reason != "" { // intentionally overwrite
info.Reason = reason
}

_ = updateStatusConditionsIfNeeded(instance, condition, reason, message)
updateStatusConditionsIfNeeded(instance, info)
// TODO: if we keep being degraded, we likely (= if we don't, it's too implicit) keep sending possibly redundant updates to the apiserver

err := r.Client.Status().Update(ctx, instance)
if err != nil {
Expand All @@ -200,25 +219,27 @@ func (r *NUMAResourcesOperatorReconciler) updateStatus(ctx context.Context, inst
return ctrl.Result{}, nil
}

func (r *NUMAResourcesOperatorReconciler) reconcileResourceAPI(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) (bool, ctrl.Result, string, error) {
func (r *NUMAResourcesOperatorReconciler) reconcileResourceAPI(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) intreconcile.Step {
applied, err := r.syncNodeResourceTopologyAPI(ctx)
if err != nil {
r.Recorder.Eventf(instance, corev1.EventTypeWarning, "FailedCRDInstall", "Failed to install Node Resource Topology CRD: %v", err)
return true, ctrl.Result{}, status.ConditionDegraded, fmt.Errorf("FailedAPISync: %w", err)
err = fmt.Errorf("FailedAPISync: %w", err)
return intreconcile.StepFailed(err)
}
if applied {
r.Recorder.Eventf(instance, corev1.EventTypeNormal, "SuccessfulCRDInstall", "Node Resource Topology CRD installed")
}
return false, ctrl.Result{}, "", nil
return intreconcile.StepSuccess()
}

func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) (bool, ctrl.Result, string, error) {
func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) intreconcile.Step {
// we need to sync machine configs first and wait for the MachineConfigPool updates
// before checking additional components for updates
mcpUpdatedFunc, err := r.syncMachineConfigs(ctx, instance, trees)
if err != nil {
r.Recorder.Eventf(instance, corev1.EventTypeWarning, "FailedMCSync", "Failed to set up machine configuration for worker nodes: %v", err)
return true, ctrl.Result{}, status.ConditionDegraded, fmt.Errorf("failed to sync machine configs: %w", err)
err = fmt.Errorf("failed to sync machine configs: %w", err)
return intreconcile.StepFailed(err)
}
r.Recorder.Eventf(instance, corev1.EventTypeNormal, "SuccessfulMCSync", "Enabled machine configuration for worker nodes")

Expand All @@ -229,21 +250,22 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx con

if !allMCPsUpdated {
// the Machine Config Pool still did not apply the machine config, wait for one minute
return true, ctrl.Result{RequeueAfter: numaResourcesRetryPeriod}, status.ConditionProgressing, nil
return intreconcile.StepOngoing(numaResourcesRetryPeriod)
}
instance.Status.MachineConfigPools = syncMachineConfigPoolNodeGroupConfigStatuses(instance.Status.MachineConfigPools, trees)

return false, ctrl.Result{}, "", nil
return intreconcile.StepSuccess()
}

func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) ([]poolDaemonSet, bool, ctrl.Result, string, error) {
func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) ([]poolDaemonSet, intreconcile.Step) {
daemonSetsInfoPerMCP, err := r.syncNUMAResourcesOperatorResources(ctx, instance, trees)
if err != nil {
r.Recorder.Eventf(instance, corev1.EventTypeWarning, "FailedRTECreate", "Failed to create Resource-Topology-Exporter DaemonSets: %v", err)
return nil, true, ctrl.Result{}, status.ConditionDegraded, fmt.Errorf("FailedRTESync: %w", err)
err = fmt.Errorf("FailedRTESync: %w", err)
return nil, intreconcile.StepFailed(err)
}
if len(daemonSetsInfoPerMCP) == 0 {
return nil, false, ctrl.Result{}, "", nil
return nil, intreconcile.StepSuccess()
}

r.Recorder.Eventf(instance, corev1.EventTypeNormal, "SuccessfulRTECreate", "Created Resource-Topology-Exporter DaemonSets")
Expand All @@ -252,36 +274,43 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context
instance.Status.DaemonSets = dssWithReadyStatus
instance.Status.RelatedObjects = relatedobjects.ResourceTopologyExporter(r.Namespace, dssWithReadyStatus)
if err != nil {
return nil, true, ctrl.Result{}, status.ConditionDegraded, err
return nil, intreconcile.StepFailed(err)
}
if !allDSsUpdated {
return nil, true, ctrl.Result{RequeueAfter: 5 * time.Second}, status.ConditionProgressing, nil
return nil, intreconcile.StepOngoing(5 * time.Second)
}

return daemonSetsInfoPerMCP, false, ctrl.Result{}, "", nil
return daemonSetsInfoPerMCP, intreconcile.StepSuccess()
}

func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) (ctrl.Result, string, error) {
if done, res, cond, err := r.reconcileResourceAPI(ctx, instance, trees); done {
return res, cond, err
func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) intreconcile.Step {
if step := r.reconcileResourceAPI(ctx, instance, trees); step.EarlyStop() {
updateStatusConditionsIfNeeded(instance, step.ConditionInfo)
return step
}

if r.Platform == platform.OpenShift {
if done, res, cond, err := r.reconcileResourceMachineConfig(ctx, instance, trees); done {
return res, cond, err
if step := r.reconcileResourceMachineConfig(ctx, instance, trees); step.EarlyStop() {
updateStatusConditionsIfNeeded(instance, step.ConditionInfo)
return step
}
}

dsPerMCP, done, res, cond, err := r.reconcileResourceDaemonSet(ctx, instance, trees)
if done {
return res, cond, err
dsPerMCP, step := r.reconcileResourceDaemonSet(ctx, instance, trees)
if step.EarlyStop() {
updateStatusConditionsIfNeeded(instance, step.ConditionInfo)
return step
}

// all fields of NodeGroupStatus are required so publish the status only when all daemonset and MCPs are updated which
// is a certain thing if we got to this point otherwise the function would have returned already
instance.Status.NodeGroups = syncNodeGroupsStatus(instance, dsPerMCP)

return ctrl.Result{}, status.ConditionAvailable, nil
updateStatusConditionsIfNeeded(instance, conditioninfo.Available())
return intreconcile.Step{
Result: ctrl.Result{},
ConditionInfo: conditioninfo.Available(),
}
}

func (r *NUMAResourcesOperatorReconciler) syncDaemonSetsStatuses(ctx context.Context, rd client.Reader, daemonSetsInfo []poolDaemonSet) ([]nropv1.NamespacedName, bool, error) {
Expand Down
3 changes: 2 additions & 1 deletion controllers/numaresourcesoperator_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ var _ = Describe("Test NUMAResourcesOperator Reconcile", func() {
})

Context("with node group with MCP selector that matches more than one MCP", func() {
It("should update the CR condition to degraded when annotation is not enabled but still creat all needed objects", func() {
It("should update the CR condition to degraded when annotation is not enabled but still create all needed objects", func() {
mcpName1 := "test1"
label1 := map[string]string{
"test1": "test1",
Expand Down Expand Up @@ -336,6 +336,7 @@ var _ = Describe("Test NUMAResourcesOperator Reconcile", func() {
Expect(availableCondition.Status).To(Equal(metav1.ConditionTrue))
})
})

Context("with correct NRO and SELinuxPolicyConfigAnnotation not set", func() {
It("should create all objects, RTE daemonsets and MCPs will get updated from the first reconcile iteration", func() {
mcpName := "test1"
Expand Down
2 changes: 1 addition & 1 deletion controllers/numaresourcesscheduler_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ func (r *NUMAResourcesSchedulerReconciler) Reconcile(ctx context.Context, req ct
}

result, condition, err := r.reconcileResource(ctx, instance)
if err := r.updateStatus(ctx, instance, condition, reasonFromError(err), messageFromError(err)); err != nil {
if err := r.updateStatus(ctx, instance, condition, status.ReasonFromError(err), status.MessageFromError(err)); err != nil {
klog.InfoS("Failed to update numaresourcesscheduler status", "Desired condition", condition, "error", err)
}

Expand Down
74 changes: 74 additions & 0 deletions internal/reconcile/step.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* Copyright 2024 Red Hat, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package reconcile

import (
"time"

ctrl "sigs.k8s.io/controller-runtime"

"github.com/openshift-kni/numaresources-operator/pkg/status"
"github.com/openshift-kni/numaresources-operator/pkg/status/conditioninfo"
)

type Step struct {
Result ctrl.Result
ConditionInfo conditioninfo.ConditionInfo
Error error
}

// Done returns true if a reconciliation step is fully completed,
// meaning all the substeps have been performed successfully, false otherwise
func (rs Step) Done() bool {
return rs.ConditionInfo.Type == status.ConditionAvailable
}

// EarlyStop returns true if we should shortcut after a reconcile substep,
// false otherwise
func (rs Step) EarlyStop() bool {
return rs.ConditionInfo.Type != status.ConditionAvailable
}

// StepSuccess returns a Step which tells the caller
// the reconciliation attempt was completed successfully
func StepSuccess() Step {
return Step{
Result: ctrl.Result{},
ConditionInfo: conditioninfo.Available(),
}
}

// StepOngoing returns a Step which tells the caller
// the reconciliation attempt needs to wait for external condition (e.g.
// MachineConfig updates and to retry to reconcile after the `after` value.
func StepOngoing(after time.Duration) Step {
return Step{
Result: ctrl.Result{RequeueAfter: after},
ConditionInfo: conditioninfo.Progressing(),
}
}

// StepFailed returns a Step which tells the caller
// the reconciliation attempt failed with the given error `err`, and
// that the NUMAResourcesOperator condition should be Degraded.
func StepFailed(err error) Step {
return Step{
Result: ctrl.Result{},
ConditionInfo: conditioninfo.DegradedFromError(err),
Error: err,
}
}
Loading