From 29c3de8cab811b1ec0ae2763fa649e57362e0fb6 Mon Sep 17 00:00:00 2001
From: Francesco Romani
Date: Tue, 7 Jan 2025 09:31:21 +0100
Subject: [PATCH] [release-4.18] nrop: controller: get rtestate from cluster just once

We used to get the current cluster state both when attempting to sync
the machine config state in the OpenShift flow, and when attempting to
sync the RTE daemonsets in both flows.

Due to the way `rtestate.FromClient` works, we end up reading the full
cluster state twice in the OpenShift flow, which is unnecessary and
wasteful.

The fix is to get the cluster state once in the higher-level function
and pass the state into the specific reconcile step functions.

The machine config and daemonset functions touch non-overlapping
objects (bar bugs), so sharing the state between them is expected to
be fine. In other words, we happened to fetch the state twice most
likely because of an oversight, not because of a deliberate decision
about data sharing/unsharing (which, were that the case, would need
some extra refactoring and more explicit code).

Signed-off-by: Francesco Romani

(cherry picked from commit 835a9a8e1acaaa29cf28dd54e354f4eae4422753)
---
 .../numaresourcesoperator_controller.go | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/controllers/numaresourcesoperator_controller.go b/controllers/numaresourcesoperator_controller.go
index a5516f342..beb7a57da 100644
--- a/controllers/numaresourcesoperator_controller.go
+++ b/controllers/numaresourcesoperator_controller.go
@@ -240,10 +240,10 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceAPI(ctx context.Conte
 	return intreconcile.StepSuccess()
 }
 
-func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) intreconcile.Step {
+func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx context.Context, instance *nropv1.NUMAResourcesOperator, existing *rtestate.ExistingManifests, trees []nodegroupv1.Tree) intreconcile.Step {
 	// we need to sync machine configs first and wait for the MachineConfigPool updates
 	// before checking additional components for updates
-	mcpUpdatedFunc, err := r.syncMachineConfigs(ctx, instance, trees)
+	mcpUpdatedFunc, err := r.syncMachineConfigs(ctx, instance, existing, trees)
 	if err != nil {
 		r.Recorder.Eventf(instance, corev1.EventTypeWarning, "FailedMCSync", "Failed to set up machine configuration for worker nodes: %v", err)
 		err = fmt.Errorf("failed to sync machine configs: %w", err)
@@ -265,8 +265,8 @@
 	return intreconcile.StepSuccess()
 }
 
-func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) ([]poolDaemonSet, intreconcile.Step) {
-	daemonSetsInfoPerPool, err := r.syncNUMAResourcesOperatorResources(ctx, instance, trees)
+func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context.Context, instance *nropv1.NUMAResourcesOperator, existing *rtestate.ExistingManifests, trees []nodegroupv1.Tree) ([]poolDaemonSet, intreconcile.Step) {
+	daemonSetsInfoPerPool, err := r.syncNUMAResourcesOperatorResources(ctx, instance, existing, trees)
 	if err != nil {
 		r.Recorder.Eventf(instance, corev1.EventTypeWarning, "FailedRTECreate", "Failed to create Resource-Topology-Exporter DaemonSets: %v", err)
 		err = fmt.Errorf("FailedRTESync: %w", err)
@@ -298,14 +298,16 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context,
 		return step
 	}
 
+	existing := rtestate.FromClient(ctx, r.Client, r.Platform, r.RTEManifests, instance, trees, r.Namespace)
+
 	if r.Platform == platform.OpenShift {
-		if step := r.reconcileResourceMachineConfig(ctx, instance, trees); step.EarlyStop() {
+		if step := r.reconcileResourceMachineConfig(ctx, instance, &existing, trees); step.EarlyStop() {
 			updateStatusConditionsIfNeeded(instance, step.ConditionInfo)
 			return step
 		}
 	}
 
-	dsPerPool, step := r.reconcileResourceDaemonSet(ctx, instance, trees)
+	dsPerPool, step := r.reconcileResourceDaemonSet(ctx, instance, &existing, trees)
 	if step.EarlyStop() {
 		updateStatusConditionsIfNeeded(instance, step.ConditionInfo)
 		return step
@@ -402,12 +404,10 @@ func (r *NUMAResourcesOperatorReconciler) syncNodeResourceTopologyAPI(ctx contex
 	return (updatedCount == len(objStates)), err
 }
 
-func (r *NUMAResourcesOperatorReconciler) syncMachineConfigs(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) (rtestate.MCPWaitForUpdatedFunc, error) {
+func (r *NUMAResourcesOperatorReconciler) syncMachineConfigs(ctx context.Context, instance *nropv1.NUMAResourcesOperator, existing *rtestate.ExistingManifests, trees []nodegroupv1.Tree) (rtestate.MCPWaitForUpdatedFunc, error) {
 	klog.V(4).InfoS("Machine Config Sync start", "trees", len(trees))
 	defer klog.V(4).Info("Machine Config Sync stop")
 
-	existing := rtestate.FromClient(ctx, r.Client, r.Platform, r.RTEManifests, instance, trees, r.Namespace)
-
 	var err error
 	// Since 4.18 we're using a built-in SELinux policy,
 	// so the MachineConfig which applies the custom policy is no longer necessary.
@@ -506,7 +506,7 @@ func getMachineConfigPoolStatusByName(mcpStatuses []nropv1.MachineConfigPool, na
 	return nropv1.MachineConfigPool{Name: name}
 }
 
-func (r *NUMAResourcesOperatorReconciler) syncNUMAResourcesOperatorResources(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) ([]poolDaemonSet, error) {
+func (r *NUMAResourcesOperatorReconciler) syncNUMAResourcesOperatorResources(ctx context.Context, instance *nropv1.NUMAResourcesOperator, existing *rtestate.ExistingManifests, trees []nodegroupv1.Tree) ([]poolDaemonSet, error) {
 	klog.V(4).InfoS("RTESync start", "trees", len(trees))
 	defer klog.V(4).Info("RTESync stop")
 
@@ -558,7 +558,6 @@ func (r *NUMAResourcesOperatorReconciler) syncNUMAResourcesOperatorResources(ctx
 		return nil
 	}
 
-	existing := rtestate.FromClient(ctx, r.Client, r.Platform, r.RTEManifests, instance, trees, r.Namespace)
	for _, objState := range existing.State(r.RTEManifests, processor, annotations.IsCustomPolicyEnabled(instance.Annotations)) {
 		if objState.Error != nil {
 			// We are likely in the bootstrap scenario. In this case, which is expected once, everything is fine.
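
The pattern the patch applies ("fetch the expensive state once in the caller, pass a pointer into each step") is general. Below is a minimal, self-contained Go sketch of it; ClusterState and fetchState are hypothetical stand-ins for rtestate.ExistingManifests and rtestate.FromClient, not the operator's real types.

```go
// Sketch of the "fetch once, pass down" refactoring from the patch above.
// ClusterState and fetchState are hypothetical stand-ins, not real
// numaresources-operator code.
package main

import "fmt"

// ClusterState stands in for rtestate.ExistingManifests: a snapshot of
// cluster objects that is expensive to build (a full cluster read).
type ClusterState struct {
	id int
}

// fetchState stands in for rtestate.FromClient. The counter makes the cost
// visible: every call represents one full read of the cluster state.
func fetchState(reads *int) ClusterState {
	*reads++
	return ClusterState{id: *reads}
}

// The reconcile steps receive the shared snapshot instead of each building
// their own. Sharing is safe here because, as the commit message notes, the
// steps touch non-overlapping objects.
func syncMachineConfigs(existing *ClusterState) {
	fmt.Println("machine config sync using snapshot", existing.id)
}

func syncDaemonSets(existing *ClusterState) {
	fmt.Println("daemonset sync using snapshot", existing.id)
}

func main() {
	var reads int
	// Before the patch each step fetched its own snapshot, so the OpenShift
	// flow paid for two full cluster reads; now there is one fetch, shared.
	existing := fetchState(&reads)
	syncMachineConfigs(&existing)
	syncDaemonSets(&existing)
	fmt.Println("cluster state fetched", reads, "time(s)") // prints 1, not 2
}
```

As the commit message points out, this sharing is only sound while the steps operate on disjoint objects; if they ever needed to coordinate writes to the same state, a more explicit ownership scheme would be required.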