diff --git a/src/assisted_installer_controller/assisted_installer_controller.go b/src/assisted_installer_controller/assisted_installer_controller.go
index 42cac9c2e4..1cb9648ae0 100644
--- a/src/assisted_installer_controller/assisted_installer_controller.go
+++ b/src/assisted_installer_controller/assisted_installer_controller.go
@@ -13,6 +13,7 @@ import (
 	"net/http"
 	"os"
 	"path"
+	"sort"
 	"strings"
 	"sync"
 	"sync/atomic"
@@ -25,6 +26,7 @@ import (
 	configv1 "github.com/openshift/api/config/v1"
 	"github.com/pkg/errors"
 	"github.com/sirupsen/logrus"
+	batchV1 "k8s.io/api/batch/v1"
 	certificatesv1 "k8s.io/api/certificates/v1"
 	v1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -63,6 +65,11 @@ const (
 	clusterOperatorReportKey string = "CLUSTER_OPERATORS_REPORT"
 	workerMCPName                   = "worker"
 	roleLabel                       = "node-role.kubernetes.io"
+
+	// This is the label that should be added to jobs that are used to complete the setup of OLM operators. The
+	// installer will wait until those jobs are completed before declaring the operator available. The value should
+	// be the identifier of the operator.
+	agentInstallSetupJobLabel = "agent-install.openshift.io/setup-job"
 )
 
 var (
@@ -121,12 +128,14 @@ type Controller interface {
 
 type controller struct {
 	ControllerConfig
-	status          *ControllerStatus
-	log             *logrus.Logger
-	ops             ops.Ops
-	ic              inventory_client.InventoryClient
-	kc              k8s_client.K8SClient
-	rebootsNotifier RebootsNotifier
+	status             *ControllerStatus
+	log                *logrus.Logger
+	ops                ops.Ops
+	ic                 inventory_client.InventoryClient
+	kc                 k8s_client.K8SClient
+	rebootsNotifier    RebootsNotifier
+	setupJobCountsLock *sync.Mutex
+	setupJobCounts     map[string]int
 }
 
 // manifest store the operator manifest used by assisted-installer to create CRs of the OLM:
@@ -139,13 +148,15 @@ type manifest struct {
 
 func newController(log *logrus.Logger, cfg ControllerConfig, ops ops.Ops, ic inventory_client.InventoryClient, kc k8s_client.K8SClient, rebootsNotifier RebootsNotifier) *controller {
 	return &controller{
-		log:              log,
-		ControllerConfig: cfg,
-		ops:              ops,
-		ic:               ic,
-		kc:               kc,
-		status:           NewControllerStatus(),
-		rebootsNotifier:  rebootsNotifier,
+		log:                log,
+		ControllerConfig:   cfg,
+		ops:                ops,
+		ic:                 ic,
+		kc:                 kc,
+		status:             NewControllerStatus(),
+		rebootsNotifier:    rebootsNotifier,
+		setupJobCountsLock: &sync.Mutex{},
+		setupJobCounts:     map[string]int{},
 	}
 }
 
@@ -587,6 +598,15 @@ func (c *controller) waitForOLMOperators(ctx context.Context) error {
 		errs = append(errs, err)
 	}
 
+	// Some of the post install manifests may be jobs to complete the setup of the operators. For example, a storage
+	// operator may include a job that waits for the default storage class and then marks it as the default. We need
+	// to check and remember that.
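+	// Such jobs are expected to carry the agentInstallSetupJobLabel label, whose value is the identifier of the
+	// operator they belong to (for example "agent-install.openshift.io/setup-job: lso").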
+	err = c.refreshSetupJobCounts()
+	if err != nil {
+		c.log.WithError(err).Warning("Failed to check setup jobs")
+		errs = append(errs, err)
+	}
+
 	c.updateFinalizingProgress(ctx, models.FinalizingStageWaitingForOlmOperatorsCsv)
 	err = c.waitForCSV(ctx)
 	if err != nil {
@@ -595,6 +615,22 @@ func (c *controller) waitForOLMOperators(ctx context.Context) error {
 		errs = append(errs, err)
 	}
+
+	// If there are any setup jobs then wait for them to finish and then mark the corresponding operator as
+	// available:
+	totalSetupJobCount := 0
+	for _, setupJobCount := range c.setupJobCounts {
+		totalSetupJobCount += setupJobCount
+	}
+	if totalSetupJobCount > 0 {
+		c.updateFinalizingProgress(ctx, models.FinalizingStageWaitingForOLMOperatorSetupJobs)
+		err = c.waitForSetupJobs(ctx)
+		if err != nil {
+			c.log.WithError(err).Warning("Failed to wait for setup jobs")
+			errs = append(errs, err)
+		}
+	}
+
 	return stderrors.Join(errs...)
 }
@@ -1506,3 +1542,103 @@ func (c controller) SetReadyState(waitTimeout time.Duration) *models.Cluster {
 func (c *controller) GetStatus() *ControllerStatus {
 	return c.status
 }
+
+// waitForSetupJobs waits until the setup jobs are completed successfully.
+func (c *controller) waitForSetupJobs(ctx context.Context) error {
+	c.log.Info("Waiting for setup jobs")
+	operators, err := c.getProgressingOLMOperators()
+	if err != nil {
+		return err
+	}
+	areAllSetupJobsComplete := func() bool {
+		err := c.refreshSetupJobCounts()
+		if err != nil {
+			c.log.WithError(err).Error("Failed to refresh setup job counts")
+			return false
+		}
+		totalSetupJobCount := 0
+		for _, operator := range operators {
+			setupJobCount := c.getSetupJobCount(operator.Name)
+			totalSetupJobCount += setupJobCount
+			if setupJobCount == 0 {
+				err = c.ic.UpdateClusterOperator(
+					ctx,
+					c.ClusterID,
+					operator.Name,
+					operator.Version,
+					models.OperatorStatusAvailable,
+					"Setup jobs completed",
+				)
+				if err != nil {
+					c.log.WithError(err).Error("Failed to update operator")
+					return false
+				}
+				c.log.WithFields(logrus.Fields{
+					"name": operator.Name,
+				}).Info("Operator is available because setup jobs are completed")
+			}
+		}
+		return totalSetupJobCount == 0
+	}
+	return utils.WaitForeverForPredicateWithCancel(
+		ctx,
+		GeneralProgressUpdateInt,
+		areAllSetupJobsComplete,
+		c.isStageTimedOut(models.FinalizingStageWaitingForOLMOperatorSetupJobs),
+	)
+}
+
+// refreshSetupJobCounts updates the internal data structure that contains the counts of pending setup jobs per
+// operator.
+func (c *controller) refreshSetupJobCounts() error {
+	c.setupJobCountsLock.Lock()
+	defer c.setupJobCountsLock.Unlock()
+	for k := range c.setupJobCounts {
+		delete(c.setupJobCounts, k)
+	}
+	list, err := c.kc.ListJobs("", metav1.ListOptions{
+		LabelSelector: agentInstallSetupJobLabel,
+	})
+	if err != nil {
+		return err
+	}
+	for i := range list.Items {
+		item := &list.Items[i]
+		name := item.Labels[agentInstallSetupJobLabel]
+		count := c.setupJobCounts[name]
+		var completed bool
+		completed, err = c.checkSetupJob(item)
+		if err != nil {
+			return err
+		}
+		if !completed {
+			count++
+		}
+		c.setupJobCounts[name] = count
+	}
+	names := make([]string, 0, len(c.setupJobCounts))
+	for name := range c.setupJobCounts {
+		names = append(names, name)
+	}
+	sort.Strings(names)
+	for _, name := range names {
+		c.log.WithFields(logrus.Fields{
+			"name":  name,
+			"count": c.setupJobCounts[name],
+		}).Info("Setup jobs")
+	}
+	return nil
+}
+
+// getSetupJobCount returns the number of pending setup jobs for the given operator.
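+// It takes setupJobCountsLock, so it is safe to call concurrently with refreshSetupJobCounts.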
+func (c *controller) getSetupJobCount(name string) int {
+	c.setupJobCountsLock.Lock()
+	defer c.setupJobCountsLock.Unlock()
+	return c.setupJobCounts[name]
+}
+
+// checkSetupJob checks if the given setup job has been completed.
+func (c *controller) checkSetupJob(job *batchV1.Job) (result bool, err error) {
+	result = job.Status.Succeeded > 0
+	return
+}
diff --git a/src/assisted_installer_controller/assisted_installer_controller_test.go b/src/assisted_installer_controller/assisted_installer_controller_test.go
index 207631d1ec..165dbe6f37 100644
--- a/src/assisted_installer_controller/assisted_installer_controller_test.go
+++ b/src/assisted_installer_controller/assisted_installer_controller_test.go
@@ -1075,9 +1075,16 @@ var _ = Describe("installer HostRoleMaster role", func() {
 			logClusterOperatorsSuccess()
 		})
 
+		mockNoSetupJobs := func() {
+			mockk8sclient.EXPECT().ListJobs("", metav1.ListOptions{
+				LabelSelector: agentInstallSetupJobLabel,
+			}).Return(&batchV1.JobList{}, nil).AnyTimes()
+		}
+
 		It("waiting for single OLM operator", func() {
 			mockAllCapabilitiesEnabled()
 			mockGetClusterForCancel()
+			mockNoSetupJobs()
 
 			By("setup", func() {
 				setControllerWaitForOLMOperators(assistedController.ClusterID)
@@ -1140,6 +1147,7 @@ var _ = Describe("installer HostRoleMaster role", func() {
 		It("waiting for single OLM operator which timeouts", func() {
 			mockAllCapabilitiesEnabled()
 			mockGetClusterForCancel()
+			mockNoSetupJobs()
 
 			By("setup", func() {
 				setControllerWaitForOLMOperators(assistedController.ClusterID)
@@ -1178,6 +1186,7 @@ var _ = Describe("installer HostRoleMaster role", func() {
 
 		It("waiting for single OLM operator which continues installation after timeout occurs", func() {
 			mockAllCapabilitiesEnabled()
+			mockNoSetupJobs()
 
 			By("setup", func() {
 				setControllerWaitForOLMOperators(assistedController.ClusterID)
@@ -1232,6 +1241,175 @@ var _ = Describe("installer HostRoleMaster role", func() {
 			Expect(assistedController.status.HasError()).Should(Equal(false))
 			Expect(assistedController.status.GetOperatorsInError()).To(ContainElement("lso"))
 		})
+
+		It("Waiting for setup job", func() {
+			mockAllCapabilitiesEnabled()
+			mockGetClusterForCancel()
+
+			operator := models.MonitoredOperator{
+				SubscriptionName: "local-storage-operator",
+				Namespace:        "openshift-local-storage",
+				OperatorType:     models.OperatorTypeOlm,
+				Name:             "lso",
+				TimeoutSeconds:   120 * 60,
+			}
+
+			By("Setup", func() {
+				setControllerWaitForOLMOperators(assistedController.ClusterID)
+				mockGetOLMOperators([]models.MonitoredOperator{operator})
+				mockApplyPostInstallManifests([]models.MonitoredOperator{operator})
+				mockk8sclient.EXPECT().GetCSVFromSubscription(
+					operator.Namespace,
+					operator.SubscriptionName,
+				).Return("local-storage-operator", nil).Times(2)
+				mockk8sclient.EXPECT().GetCSV(
+					operator.Namespace,
+					operator.SubscriptionName,
+				).Return(
+					&olmv1alpha1.ClusterServiceVersion{
+						Status: olmv1alpha1.ClusterServiceVersionStatus{
+							Phase: olmv1alpha1.CSVPhaseNone,
+						},
+					},
+					nil,
+				).Times(2)
+
+				mockk8sclient.EXPECT().ListJobs(
+					"",
+					metav1.ListOptions{
+						LabelSelector: agentInstallSetupJobLabel,
+					},
+				).Return(
+					&batchV1.JobList{
+						Items: []batchV1.Job{{
+							ObjectMeta: metav1.ObjectMeta{
+								Labels: map[string]string{
+									agentInstallSetupJobLabel: operator.Name,
+								},
+							},
+							Status: batchV1.JobStatus{
+								Succeeded: 0,
+							},
+						}},
+					},
+					nil,
+				).Times(1)
+			})
+
+			By("Empty status", func() {
+				operator.Status = ""
+				mockGetServiceOperators([]models.MonitoredOperator{operator})
+				mockGetCSV(
+					operator,
+					&olmv1alpha1.ClusterServiceVersion{
+						Status: olmv1alpha1.ClusterServiceVersionStatus{
+							Phase: olmv1alpha1.CSVPhaseInstalling,
+						},
+					},
+				)
+			})
+
+			By("In progress", func() {
+				operator.Status = models.OperatorStatusProgressing
+				mockGetServiceOperators([]models.MonitoredOperator{operator})
+				mockGetCSV(
+					operator,
+					&olmv1alpha1.ClusterServiceVersion{
+						Status: olmv1alpha1.ClusterServiceVersionStatus{
+							Phase: olmv1alpha1.CSVPhaseInstalling,
+						},
+					},
+				)
+				mockbmclient.EXPECT().UpdateClusterOperator(
+					gomock.Any(),
+					"cluster-id",
+					"lso",
+					gomock.Any(),
+					models.OperatorStatusProgressing,
+					gomock.Any(),
+				).Return(nil).Times(1)
+			})
+
+			By("Available but with setup jobs", func() {
+				operator.Status = models.OperatorStatusProgressing
+				mockGetServiceOperators([]models.MonitoredOperator{operator})
+				mockGetCSV(
+					operator,
+					&olmv1alpha1.ClusterServiceVersion{
+						Status: olmv1alpha1.ClusterServiceVersionStatus{
+							Phase: olmv1alpha1.CSVPhaseSucceeded,
+						},
+					},
+				)
+				mockbmclient.EXPECT().UpdateClusterOperator(
+					gomock.Any(),
+					"cluster-id",
+					"lso",
+					gomock.Any(),
+					models.OperatorStatusProgressing,
+					gomock.Any(),
+				).Return(nil).Times(1)
+			})
+
+			By("Available and no setup jobs", func() {
+				operator.Status = models.OperatorStatusProgressing
+				mockGetOLMOperators([]models.MonitoredOperator{operator})
+				mockk8sclient.EXPECT().ListJobs(
+					"",
+					metav1.ListOptions{
+						LabelSelector: agentInstallSetupJobLabel,
+					},
+				).Return(
+					&batchV1.JobList{
+						Items: []batchV1.Job{{
+							ObjectMeta: metav1.ObjectMeta{
+								Labels: map[string]string{
+									agentInstallSetupJobLabel: "lso",
+								},
+							},
+							Status: batchV1.JobStatus{
+								Succeeded: 1,
+							},
+						}},
+					},
+					nil,
+				).Times(1)
+				mockbmclient.EXPECT().UpdateClusterOperator(
+					gomock.Any(),
+					"cluster-id",
+					"lso",
+					gomock.Any(),
+					models.OperatorStatusAvailable,
+					gomock.Any(),
+				).Return(nil).Times(1)
+			})
+
+			mockbmclient.EXPECT().CompleteInstallation(
+				gomock.Any(),
+				"cluster-id",
+				true,
+				"",
+				nil,
+			).Return(fmt.Errorf("dummy")).Times(1)
+			mockbmclient.EXPECT().CompleteInstallation(gomock.Any(), "cluster-id", true, "", nil).Return(nil).Times(1)
+			mockSuccessUpdateFinalizingStages(
+				models.FinalizingStageWaitingForClusterOperators,
+				models.FinalizingStageAddingRouterCa,
+				models.FinalizingStageWaitingForOlmOperatorsCsvInitialization,
+				models.FinalizingStageApplyingOlmManifests,
+				models.FinalizingStageWaitingForOlmOperatorsCsv,
+				models.FinalizingStageWaitingForOLMOperatorSetupJobs,
+				models.FinalizingStageDone,
+			)
+
+			wg.Add(1)
+			mockk8sclient.EXPECT().ListJobs(olmNamespace, metav1.ListOptions{}).Return(&batchV1.JobList{}, nil).AnyTimes()
+			mockk8sclient.EXPECT().GetAllInstallPlansOfSubscription(gomock.Any()).Return([]olmv1alpha1.InstallPlan{}, nil).AnyTimes()
+			assistedController.PostInstallConfigs(context.TODO(), &wg)
+			wg.Wait()
+			Expect(assistedController.status.HasError()).Should(Equal(false))
+			Expect(assistedController.status.HasOperatorError()).Should(Equal(false))
+		})
 	})
 
 	Context("Patching node labels", func() {
diff --git a/src/assisted_installer_controller/operator_handler.go b/src/assisted_installer_controller/operator_handler.go
index 777e64ce3d..c36e7bb0c3 100644
--- a/src/assisted_installer_controller/operator_handler.go
+++ b/src/assisted_installer_controller/operator_handler.go
@@ -49,7 +49,22 @@ func (c *controller) isOperatorAvailable(handler OperatorHandler) bool {
 		return false
 	}
 
-	err = c.ic.UpdateClusterOperator(context.TODO(), c.ClusterID, operatorName, operatorVersion, operatorStatus, operatorMessage)
+	// If the operator is available but still has setup jobs, we don't want to mark it as available here:
+	// if we did that for all operators, the service would consider the cluster ready, which would in turn
+	// stop the controller without first checking whether the setup jobs have finished. So instead we
+	// report the operator as progressing here, and mark it as available later, once the setup jobs have
+	// completed.
+	var reportedStatus models.OperatorStatus
+	if operatorStatus == models.OperatorStatusAvailable && c.getSetupJobCount(operatorName) > 0 {
+		c.log.WithFields(logrus.Fields{
+			"name": operatorName,
+		}).Info("Operator is available but has setup jobs")
+		reportedStatus = models.OperatorStatusProgressing
+	} else {
+		reportedStatus = operatorStatus
+	}
+
+	err = c.ic.UpdateClusterOperator(context.TODO(), c.ClusterID, operatorName, operatorVersion, reportedStatus, operatorMessage)
 	if err != nil {
 		c.log.WithError(err).Warnf("Failed to update %s operator status %s with message %s", operatorName, operatorStatus, operatorMessage)
 		return false