From 4346df0b18f6f512796ac22ca5e8ddcd541ab7a3 Mon Sep 17 00:00:00 2001 From: Juan Hernandez Date: Mon, 11 Nov 2024 10:52:45 +0100 Subject: [PATCH] MGMT-19148: Wait for OLM operator setup jobs When operators are automatically enabled from assisted installer the custom manifests may contain Kubernetes jobs to complete the setup. For example, a hypothetical storage operator can include in the custom manifests a job that waits for the storage class and marks it as the default. Currently the installer doesn't wait for jobs to finish, and it will declare the cluster to be ready even if those jobs haven't been completed. This patch changes the controller so that it assumes that jobs that have the `agent-install.openshift.io/setup-job` label are such setup jobs, and waits for them to finish. The value of the label must be the name of the corresponding operator. Related: https://issues.redhat.com/browse/MGMT-19148 Signed-off-by: Juan Hernandez --- .../assisted_installer_controller.go | 162 ++++++++++++++-- .../assisted_installer_controller_test.go | 178 ++++++++++++++++++ .../operator_handler.go | 17 +- 3 files changed, 343 insertions(+), 14 deletions(-) diff --git a/src/assisted_installer_controller/assisted_installer_controller.go b/src/assisted_installer_controller/assisted_installer_controller.go index 42cac9c2e4..1cb9648ae0 100644 --- a/src/assisted_installer_controller/assisted_installer_controller.go +++ b/src/assisted_installer_controller/assisted_installer_controller.go @@ -13,6 +13,7 @@ import ( "net/http" "os" "path" + "sort" "strings" "sync" "sync/atomic" @@ -25,6 +26,7 @@ import ( configv1 "github.com/openshift/api/config/v1" "github.com/pkg/errors" "github.com/sirupsen/logrus" + batchV1 "k8s.io/api/batch/v1" certificatesv1 "k8s.io/api/certificates/v1" v1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -63,6 +65,11 @@ const ( clusterOperatorReportKey string = "CLUSTER_OPERATORS_REPORT" workerMCPName = "worker" roleLabel = 
"node-role.kubernetes.io" + + // This is the label that should be added to jobs that are used to complete the setup of OLM operators. The + // isntaller will wait till those jobs are completed before declaring the operator available. The value should + // be the identifier of the operator. + agentInstallSetupJobLabel = "agent-install.openshift.io/setup-job" ) var ( @@ -121,12 +128,14 @@ type Controller interface { type controller struct { ControllerConfig - status *ControllerStatus - log *logrus.Logger - ops ops.Ops - ic inventory_client.InventoryClient - kc k8s_client.K8SClient - rebootsNotifier RebootsNotifier + status *ControllerStatus + log *logrus.Logger + ops ops.Ops + ic inventory_client.InventoryClient + kc k8s_client.K8SClient + rebootsNotifier RebootsNotifier + setupJobCountsLock *sync.Mutex + setupJobCounts map[string]int } // manifest store the operator manifest used by assisted-installer to create CRs of the OLM: @@ -139,13 +148,15 @@ type manifest struct { func newController(log *logrus.Logger, cfg ControllerConfig, ops ops.Ops, ic inventory_client.InventoryClient, kc k8s_client.K8SClient, rebootsNotifier RebootsNotifier) *controller { return &controller{ - log: log, - ControllerConfig: cfg, - ops: ops, - ic: ic, - kc: kc, - status: NewControllerStatus(), - rebootsNotifier: rebootsNotifier, + log: log, + ControllerConfig: cfg, + ops: ops, + ic: ic, + kc: kc, + status: NewControllerStatus(), + rebootsNotifier: rebootsNotifier, + setupJobCountsLock: &sync.Mutex{}, + setupJobCounts: map[string]int{}, } } @@ -587,6 +598,15 @@ func (c *controller) waitForOLMOperators(ctx context.Context) error { errs = append(errs, err) } + // Some of the post install manifests may be jobs to complete the setup of the operators. For example, a storage + // operator may include a job that waits for the default storage class and then marks it as the default. We need + // to check and remember that. 
+ err = c.refreshSetupJobCounts() + if err != nil { + c.log.WithError(err).Warning("Failed to check setup jobs") + errs = append(errs, err) + } + c.updateFinalizingProgress(ctx, models.FinalizingStageWaitingForOlmOperatorsCsv) err = c.waitForCSV(ctx) if err != nil { @@ -595,6 +615,22 @@ func (c *controller) waitForOLMOperators(ctx context.Context) error { errs = append(errs, err) } + + // If there are any setup jobs then wait for them to finish and then mark the corresponding operator as + // available: + totalSetupJobCount := 0 + for _, setupJobCount := range c.setupJobCounts { + totalSetupJobCount += setupJobCount + } + if totalSetupJobCount > 0 { + c.updateFinalizingProgress(ctx, models.FinalizingStageWaitingForOLMOperatorSetupJobs) + err = c.waitForSetupJobs(ctx) + if err != nil { + c.log.WithError(err).Warning("Failed to wait for setup jobs") + errs = append(errs, err) + } + } + return stderrors.Join(errs...) } @@ -1506,3 +1542,103 @@ func (c controller) SetReadyState(waitTimeout time.Duration) *models.Cluster { func (c *controller) GetStatus() *ControllerStatus { return c.status } + +// waitForSetupJobs waits till the jobs are completed successfully. 
+func (c *controller) waitForSetupJobs(ctx context.Context) error { + c.log.Info("Waiting for setup jobs") + operators, err := c.getProgressingOLMOperators() + if err != nil { + return err + } + areAllSetupJobsComplete := func() bool { + err := c.refreshSetupJobCounts() + if err != nil { + c.log.WithError(err).Error("Failed to refresh setup job counts") + return false + } + totalSetupJobCount := 0 + for _, operator := range operators { + setupJobCount := c.getSetupJobCount(operator.Name) + totalSetupJobCount += setupJobCount + if setupJobCount == 0 { + err = c.ic.UpdateClusterOperator( + ctx, + c.ClusterID, + operator.Name, + operator.Version, + models.OperatorStatusAvailable, + "Setup jobs completed", + ) + if err != nil { + c.log.WithError(err).Error("Failed to update operator") + return false + } + c.log.WithFields(logrus.Fields{ + "name": operator.Name, + }).Info("Operator is available because setup jobs are completed") + } + } + return totalSetupJobCount == 0 + } + return utils.WaitForeverForPredicateWithCancel( + ctx, + GeneralProgressUpdateInt, + areAllSetupJobsComplete, + c.isStageTimedOut(models.FinalizingStageWaitingForOLMOperatorSetupJobs), + ) +} + +// refreshSetupJobCounts updates the internal data structure that contains the counts of setup jobs. 
+func (c *controller) refreshSetupJobCounts() error { + c.setupJobCountsLock.Lock() + defer c.setupJobCountsLock.Unlock() + for k := range c.setupJobCounts { + delete(c.setupJobCounts, k) + } + list, err := c.kc.ListJobs("", metav1.ListOptions{ + LabelSelector: agentInstallSetupJobLabel, + }) + if err != nil { + return err + } + for i := range list.Items { + item := &list.Items[i] + name := item.Labels[agentInstallSetupJobLabel] + count := c.setupJobCounts[name] + var completed bool + completed, err = c.checkSetupJob(item) + if err != nil { + return err + } + if !completed { + count++ + } + c.setupJobCounts[name] = count + } + names := make([]string, 0, len(c.setupJobCounts)) + // collect the operator names; they are sorted below so the log output is deterministic + for name := range c.setupJobCounts { + names = append(names, name) + } + sort.Strings(names) + for _, name := range names { + c.log.WithFields(logrus.Fields{ + "name": name, + "count": c.setupJobCounts[name], + }).Info("Setup jobs") + } + return nil +} + +// getSetupJobCount returns the number of setup jobs for the given operator. +func (c *controller) getSetupJobCount(name string) int { + c.setupJobCountsLock.Lock() + defer c.setupJobCountsLock.Unlock() + return c.setupJobCounts[name] +} + +// checkSetupJob checks if the given setup job has been completed. 
+func (c *controller) checkSetupJob(job *batchV1.Job) (result bool, err error) { + result = job.Status.Succeeded > 0 + return +} diff --git a/src/assisted_installer_controller/assisted_installer_controller_test.go b/src/assisted_installer_controller/assisted_installer_controller_test.go index 207631d1ec..165dbe6f37 100644 --- a/src/assisted_installer_controller/assisted_installer_controller_test.go +++ b/src/assisted_installer_controller/assisted_installer_controller_test.go @@ -1075,9 +1075,16 @@ var _ = Describe("installer HostRoleMaster role", func() { logClusterOperatorsSuccess() }) + mockNoSetupJobs := func() { + mockk8sclient.EXPECT().ListJobs("", metav1.ListOptions{ + LabelSelector: agentInstallSetupJobLabel, + }).Return(&batchV1.JobList{}, nil).AnyTimes() + } + It("waiting for single OLM operator", func() { mockAllCapabilitiesEnabled() mockGetClusterForCancel() + mockNoSetupJobs() By("setup", func() { setControllerWaitForOLMOperators(assistedController.ClusterID) @@ -1140,6 +1147,7 @@ var _ = Describe("installer HostRoleMaster role", func() { It("waiting for single OLM operator which timeouts", func() { mockAllCapabilitiesEnabled() mockGetClusterForCancel() + mockNoSetupJobs() By("setup", func() { setControllerWaitForOLMOperators(assistedController.ClusterID) @@ -1178,6 +1186,7 @@ var _ = Describe("installer HostRoleMaster role", func() { It("waiting for single OLM operator which continues installation after timeout occurs", func() { mockAllCapabilitiesEnabled() + mockNoSetupJobs() By("setup", func() { setControllerWaitForOLMOperators(assistedController.ClusterID) @@ -1232,6 +1241,175 @@ var _ = Describe("installer HostRoleMaster role", func() { Expect(assistedController.status.HasError()).Should(Equal(false)) Expect(assistedController.status.GetOperatorsInError()).To(ContainElement("lso")) }) + + It("Waiting for setup job", func() { + mockAllCapabilitiesEnabled() + mockGetClusterForCancel() + + operator := models.MonitoredOperator{ + SubscriptionName: 
"local-storage-operator", + Namespace: "openshift-local-storage", + OperatorType: models.OperatorTypeOlm, + Name: "lso", + TimeoutSeconds: 120 * 60, + } + + By("Setup", func() { + setControllerWaitForOLMOperators(assistedController.ClusterID) + mockGetOLMOperators([]models.MonitoredOperator{operator}) + mockApplyPostInstallManifests([]models.MonitoredOperator{operator}) + mockk8sclient.EXPECT().GetCSVFromSubscription( + operator.Namespace, + operator.SubscriptionName, + ).Return("local-storage-operator", nil).Times(2) + mockk8sclient.EXPECT().GetCSV( + operator.Namespace, + operator.SubscriptionName, + ).Return( + &olmv1alpha1.ClusterServiceVersion{ + Status: olmv1alpha1.ClusterServiceVersionStatus{ + Phase: olmv1alpha1.CSVPhaseNone, + }, + }, + nil, + ).Times(2) + + mockk8sclient.EXPECT().ListJobs( + "", + metav1.ListOptions{ + LabelSelector: agentInstallSetupJobLabel, + }, + ).Return( + &batchV1.JobList{ + Items: []batchV1.Job{{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + agentInstallSetupJobLabel: operator.Name, + }, + }, + Status: batchV1.JobStatus{ + Succeeded: 0, + }, + }}, + }, + nil, + ).Times(1) + }) + + By("Empty status", func() { + operator.Status = "" + mockGetServiceOperators([]models.MonitoredOperator{operator}) + mockGetCSV( + operator, + &olmv1alpha1.ClusterServiceVersion{ + Status: olmv1alpha1.ClusterServiceVersionStatus{ + Phase: olmv1alpha1.CSVPhaseInstalling, + }, + }, + ) + }) + + By("In progress", func() { + operator.Status = models.OperatorStatusProgressing + mockGetServiceOperators([]models.MonitoredOperator{operator}) + mockGetCSV( + operator, + &olmv1alpha1.ClusterServiceVersion{ + Status: olmv1alpha1.ClusterServiceVersionStatus{ + Phase: olmv1alpha1.CSVPhaseInstalling, + }, + }, + ) + mockbmclient.EXPECT().UpdateClusterOperator( + gomock.Any(), + "cluster-id", + "lso", + gomock.Any(), + models.OperatorStatusProgressing, + gomock.Any(), + ).Return(nil).Times(1) + }) + + By("Available but with setup jobs", func() { + 
operator.Status = models.OperatorStatusProgressing + mockGetServiceOperators([]models.MonitoredOperator{operator}) + mockGetCSV( + operator, + &olmv1alpha1.ClusterServiceVersion{ + Status: olmv1alpha1.ClusterServiceVersionStatus{ + Phase: olmv1alpha1.CSVPhaseSucceeded, + }, + }, + ) + mockbmclient.EXPECT().UpdateClusterOperator( + gomock.Any(), + "cluster-id", + "lso", + gomock.Any(), + models.OperatorStatusProgressing, + gomock.Any(), + ).Return(nil).Times(1) + }) + + By("Available and no setup jobs", func() { + operator.Status = models.OperatorStatusProgressing + mockGetOLMOperators([]models.MonitoredOperator{operator}) + mockk8sclient.EXPECT().ListJobs( + "", + metav1.ListOptions{ + LabelSelector: agentInstallSetupJobLabel, + }, + ).Return( + &batchV1.JobList{ + Items: []batchV1.Job{{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + agentInstallSetupJobLabel: "lso", + }, + }, + Status: batchV1.JobStatus{ + Succeeded: 1, + }, + }}, + }, + nil, + ).Times(1) + mockbmclient.EXPECT().UpdateClusterOperator( + gomock.Any(), + "cluster-id", + "lso", + gomock.Any(), + models.OperatorStatusAvailable, + gomock.Any(), + ).Return(nil).Times(1) + }) + + mockbmclient.EXPECT().CompleteInstallation( + gomock.Any(), + "cluster-id", + true, + "", + nil, + ).Return(fmt.Errorf("dummy")).Times(1) + mockbmclient.EXPECT().CompleteInstallation(gomock.Any(), "cluster-id", true, "", nil).Return(nil).Times(1) + mockSuccessUpdateFinalizingStages( + models.FinalizingStageWaitingForClusterOperators, + models.FinalizingStageAddingRouterCa, + models.FinalizingStageWaitingForOlmOperatorsCsvInitialization, + models.FinalizingStageApplyingOlmManifests, + models.FinalizingStageWaitingForOlmOperatorsCsv, + models.FinalizingStageWaitingForOLMOperatorSetupJobs, + models.FinalizingStageDone, + ) + + wg.Add(1) + mockk8sclient.EXPECT().ListJobs(olmNamespace, metav1.ListOptions{}).Return(&batchV1.JobList{}, nil).AnyTimes() + 
mockk8sclient.EXPECT().GetAllInstallPlansOfSubscription(gomock.Any()).Return([]olmv1alpha1.InstallPlan{}, nil).AnyTimes() + assistedController.PostInstallConfigs(context.TODO(), &wg) + wg.Wait() + Expect(assistedController.status.HasError()).Should(Equal(false)) + Expect(assistedController.status.HasOperatorError()).Should(Equal(false)) + }) }) Context("Patching node labels", func() { diff --git a/src/assisted_installer_controller/operator_handler.go b/src/assisted_installer_controller/operator_handler.go index 777e64ce3d..c36e7bb0c3 100644 --- a/src/assisted_installer_controller/operator_handler.go +++ b/src/assisted_installer_controller/operator_handler.go @@ -49,7 +49,22 @@ func (c *controller) isOperatorAvailable(handler OperatorHandler) bool { return false } - err = c.ic.UpdateClusterOperator(context.TODO(), c.ClusterID, operatorName, operatorVersion, operatorStatus, operatorMessage) + // If the operator is available but has setup jobs then we don't want to mark it as available here + // because if we do that for all operators then the service will consider that the cluster is ready + // and that will in turn stop the controller without first checking if the setup jobs have finished. + // So instead we mark the operator as progressing, and will mark it as available later, when we check + // the setup jobs. + var reportedStatus models.OperatorStatus + if operatorStatus == models.OperatorStatusAvailable && c.getSetupJobCount(operatorName) > 0 { + c.log.WithFields(logrus.Fields{ + "name": operatorName, + }).Info("Operator is available but has setup jobs", operatorName) + reportedStatus = models.OperatorStatusProgressing + } else { + reportedStatus = operatorStatus + } + + err = c.ic.UpdateClusterOperator(context.TODO(), c.ClusterID, operatorName, operatorVersion, reportedStatus, operatorMessage) if err != nil { c.log.WithError(err).Warnf("Failed to update %s operator status %s with message %s", operatorName, operatorStatus, operatorMessage) return false