(feat) introduce maxConsecutiveFailures knob

The maxConsecutiveFailures option controls deployment retry behavior. This optional field defines a threshold for consecutive deployment failures. After the specified number of consecutive failures, Sveltos will stop retrying the deployment. Retries will only resume if the profile configuration is updated. If maxConsecutiveFailures is not configured, Sveltos will retry indefinitely.
projectsveltos · Feb 13, 2025 · abb5705 · abb5705
1 parent 60c1594
commit abb5705
Show file tree

Hide file tree

Showing 15 changed files with 216 additions and 24 deletions.
diff --git a/api/v1alpha1/zz_generated.conversion.go b/api/v1alpha1/zz_generated.conversion.go
diff --git a/api/v1beta1/clustersummary_types.go b/api/v1beta1/clustersummary_types.go
@@ -91,6 +91,11 @@ type FeatureSummary struct {
 	// +optional
 	Hash []byte `json:"hash,omitempty"`
 
+	// ConsecutiveFailures is the number of consecutive deployment failures recorded for this feature.
+	// It is incremented each time a deployment fails and reset to zero when a deployment succeeds or the configuration changes.
+	// Once it reaches maxConsecutiveFailures (if set), Sveltos stops retrying.
+	ConsecutiveFailures uint `json:"consecutiveFailures"`
+
 	// Status represents the state of the feature in the workload cluster
 	// +optional
 	Status FeatureStatus `json:"status,omitempty"`

diff --git a/api/v1beta1/spec.go b/api/v1beta1/spec.go
@@ -724,6 +724,13 @@ type Spec struct {
 	// +optional
 	DriftExclusions []DriftExclusion `json:"driftExclusions,omitempty"`
 
+	// The maximum number of consecutive deployment failures that Sveltos will permit.
+	// After this many consecutive failures, the deployment will be considered failed, and Sveltos will stop retrying.
+	// This setting applies only to feature deployments, not resource removal.
+	// This field is optional. If not set, Sveltos default behavior is to keep retrying.
+	// +optional
+	MaxConsecutiveFailures *uint `json:"maxConsecutiveFailures,omitempty"`
+
 	// ExtraLabels: These labels will be added by Sveltos to all Kubernetes resources deployed in
 	// a managed cluster based on this ClusterProfile/Profile instance.
 	// **Important:** If a resource deployed by Sveltos already has a label with a key present in

diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go
diff --git a/config/crd/bases/config.projectsveltos.io_clusterprofiles.yaml b/config/crd/bases/config.projectsveltos.io_clusterprofiles.yaml
@@ -1638,6 +1638,13 @@ spec:
                   - namespace
                   type: object
                 type: array
+              maxConsecutiveFailures:
+                description: |-
+                  The maximum number of consecutive deployment failures that Sveltos will permit.
+                  After this many consecutive failures, the deployment will be considered failed, and Sveltos will stop retrying.
+                  This setting applies only to feature deployments, not resource removal.
+                  This field is optional. If not set, Sveltos default behavior is to keep retrying.
+                type: integer
               maxUpdate:
                 anyOf:
                 - type: integer

diff --git a/config/crd/bases/config.projectsveltos.io_clustersummaries.yaml b/config/crd/bases/config.projectsveltos.io_clustersummaries.yaml
@@ -1666,6 +1666,13 @@ spec:
                       - namespace
                       type: object
                     type: array
+                  maxConsecutiveFailures:
+                    description: |-
+                      The maximum number of consecutive deployment failures that Sveltos will permit.
+                      After this many consecutive failures, the deployment will be considered failed, and Sveltos will stop retrying.
+                      This setting applies only to feature deployments, not resource removal.
+                      This field is optional. If not set, Sveltos default behavior is to keep retrying.
+                    type: integer
                   maxUpdate:
                     anyOf:
                     - type: integer
@@ -2069,6 +2076,12 @@ spec:
                     FeatureSummary contains a summary of the state of a workload
                     cluster feature.
                   properties:
+                    consecutiveFailures:
+                      description: |-
+                        ConsecutiveFailures is the number of consecutive deployment failures recorded for this feature.
+                        It is incremented each time a deployment fails and reset to zero when a deployment succeeds or the configuration changes.
+                        Once it reaches maxConsecutiveFailures (if set), Sveltos stops retrying.
+                      type: integer
                     deployedGroupVersionKind:
                       description: |-
                         DeployedGroupVersionKind contains all GroupVersionKinds deployed in either
@@ -2116,6 +2129,7 @@ spec:
                       - Removed
                       type: string
                   required:
+                  - consecutiveFailures
                   - featureID
                   type: object
                 type: array

diff --git a/config/crd/bases/config.projectsveltos.io_profiles.yaml b/config/crd/bases/config.projectsveltos.io_profiles.yaml
@@ -1638,6 +1638,13 @@ spec:
                   - namespace
                   type: object
                 type: array
+              maxConsecutiveFailures:
+                description: |-
+                  The maximum number of consecutive deployment failures that Sveltos will permit.
+                  After this many consecutive failures, the deployment will be considered failed, and Sveltos will stop retrying.
+                  This setting applies only to feature deployments, not resource removal.
+                  This field is optional. If not set, Sveltos default behavior is to keep retrying.
+                type: integer
               maxUpdate:
                 anyOf:
                 - type: integer

diff --git a/controllers/clustersummary_controller.go b/controllers/clustersummary_controller.go
@@ -1285,13 +1285,13 @@ func (r *ClusterSummaryReconciler) setFailureMessage(clusterSummaryScope *scope.
 
 func (r *ClusterSummaryReconciler) resetFeatureStatus(clusterSummaryScope *scope.ClusterSummaryScope, status configv1beta1.FeatureStatus) {
 	if clusterSummaryScope.ClusterSummary.Spec.ClusterProfileSpec.HelmCharts != nil {
-		clusterSummaryScope.SetFeatureStatus(configv1beta1.FeatureHelm, status, nil)
+		clusterSummaryScope.SetFeatureStatus(configv1beta1.FeatureHelm, status, nil, nil)
 	}
 	if clusterSummaryScope.ClusterSummary.Spec.ClusterProfileSpec.PolicyRefs != nil {
-		clusterSummaryScope.SetFeatureStatus(configv1beta1.FeatureResources, status, nil)
+		clusterSummaryScope.SetFeatureStatus(configv1beta1.FeatureResources, status, nil, nil)
 	}
 	if clusterSummaryScope.ClusterSummary.Spec.ClusterProfileSpec.KustomizationRefs != nil {
-		clusterSummaryScope.SetFeatureStatus(configv1beta1.FeatureKustomize, status, nil)
+		clusterSummaryScope.SetFeatureStatus(configv1beta1.FeatureKustomize, status, nil, nil)
 	}
 }
 

diff --git a/controllers/clustersummary_deployer.go b/controllers/clustersummary_deployer.go
@@ -102,13 +102,22 @@ func (r *ClusterSummaryReconciler) deployFeature(ctx context.Context, clusterSum
 	if !isConfigSame {
 		logger.V(logs.LogDebug).Info(fmt.Sprintf("configuration has changed. Current hash %x. Previous hash %x",
 			currentHash, hash))
+		clusterSummaryScope.ResetConsecutiveFailures(f.id)
 	}
 
 	if !r.shouldRedeploy(clusterSummaryScope, f, isConfigSame, logger) {
 		logger.V(logs.LogDebug).Info("no need to redeploy")
 		return nil
 	}
 
+	return r.proceedDeployingFeature(ctx, clusterSummaryScope, f, isConfigSame, currentHash, logger)
+}
+
+func (r *ClusterSummaryReconciler) proceedDeployingFeature(ctx context.Context, clusterSummaryScope *scope.ClusterSummaryScope,
+	f feature, isConfigSame bool, currentHash []byte, logger logr.Logger) error {
+
+	clusterSummary := clusterSummaryScope.ClusterSummary
+
 	var status *configv1beta1.FeatureStatus
 	var resultError error
 
@@ -135,6 +144,12 @@ func (r *ClusterSummaryReconciler) deployFeature(ctx context.Context, clusterSum
 				r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, resultError, logger)
 				return nil
 			}
+			if r.maxNumberOfConsecutiveFailureReached(clusterSummaryScope, f, logger) {
+				nonRetriableStatus := configv1beta1.FeatureStatusFailedNonRetriable
+				resultError := errors.New("the maximum number of consecutive errors has been reached")
+				r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, resultError, logger)
+				return nil
+			}
 		}
 		if *status == configv1beta1.FeatureStatusProvisioning {
 			return fmt.Errorf("feature is still being provisioned")
@@ -355,6 +370,20 @@ func (r *ClusterSummaryReconciler) getHash(clusterSummaryScope *scope.ClusterSum
 	return nil
 }
 
+// getConsecutiveFailures returns the number of consecutive failures recorded for featureID,
+// or 0 when no FeatureSummary is found for it.
+func (r *ClusterSummaryReconciler) getConsecutiveFailures(clusterSummaryScope *scope.ClusterSummaryScope,
+	featureID configv1beta1.FeatureID) uint {
+
+	clusterSummary := clusterSummaryScope.ClusterSummary
+
+	if fs := getFeatureSummaryForFeatureID(clusterSummary, featureID); fs != nil {
+		return fs.ConsecutiveFailures
+	}
+
+	return 0
+}
+
 func (r *ClusterSummaryReconciler) updateFeatureStatus(clusterSummaryScope *scope.ClusterSummaryScope,
 	featureID configv1beta1.FeatureID, status *configv1beta1.FeatureStatus, hash []byte, statusError error,
 	logger logr.Logger) {
@@ -368,17 +397,20 @@ func (r *ClusterSummaryReconciler) updateFeatureStatus(clusterSummaryScope *scop
 
 	switch *status {
 	case configv1beta1.FeatureStatusProvisioned:
-		clusterSummaryScope.SetFeatureStatus(featureID, configv1beta1.FeatureStatusProvisioned, hash)
+		failed := false
+		clusterSummaryScope.SetFeatureStatus(featureID, configv1beta1.FeatureStatusProvisioned, hash, &failed)
 		clusterSummaryScope.SetFailureMessage(featureID, nil)
 	case configv1beta1.FeatureStatusRemoved:
-		clusterSummaryScope.SetFeatureStatus(featureID, configv1beta1.FeatureStatusRemoved, hash)
+		failed := false
+		clusterSummaryScope.SetFeatureStatus(featureID, configv1beta1.FeatureStatusRemoved, hash, &failed)
 		clusterSummaryScope.SetFailureMessage(featureID, nil)
 	case configv1beta1.FeatureStatusProvisioning:
-		clusterSummaryScope.SetFeatureStatus(featureID, configv1beta1.FeatureStatusProvisioning, hash)
+		clusterSummaryScope.SetFeatureStatus(featureID, configv1beta1.FeatureStatusProvisioning, hash, nil)
 	case configv1beta1.FeatureStatusRemoving:
-		clusterSummaryScope.SetFeatureStatus(featureID, configv1beta1.FeatureStatusRemoving, hash)
+		clusterSummaryScope.SetFeatureStatus(featureID, configv1beta1.FeatureStatusRemoving, hash, nil)
 	case configv1beta1.FeatureStatusFailed, configv1beta1.FeatureStatusFailedNonRetriable:
-		clusterSummaryScope.SetFeatureStatus(featureID, *status, hash)
+		failed := true
+		clusterSummaryScope.SetFeatureStatus(featureID, *status, hash, &failed)
 		err := statusError.Error()
 		clusterSummaryScope.SetFailureMessage(featureID, &err)
 	}
@@ -436,3 +468,19 @@ func (r *ClusterSummaryReconciler) shouldRedeploy(clusterSummaryScope *scope.Clu
 
 	return true
 }
+
+// maxNumberOfConsecutiveFailureReached returns true if MaxConsecutiveFailures is set and the feature's consecutive failures have reached it.
+func (r *ClusterSummaryReconciler) maxNumberOfConsecutiveFailureReached(clusterSummaryScope *scope.ClusterSummaryScope, f feature,
+	logger logr.Logger) bool {
+
+	if clusterSummaryScope.ClusterSummary.Spec.ClusterProfileSpec.MaxConsecutiveFailures != nil {
+		consecutiveFailures := r.getConsecutiveFailures(clusterSummaryScope, f.id)
+		if consecutiveFailures >= *clusterSummaryScope.ClusterSummary.Spec.ClusterProfileSpec.MaxConsecutiveFailures {
+			msg := fmt.Sprintf("max number of consecutive failures reached %d", consecutiveFailures)
+			logger.V(logs.LogDebug).Info(msg)
+			return true
+		}
+	}
+
+	return false
+}
diff --git a/controllers/conflicts.go b/controllers/conflicts.go
@@ -125,7 +125,7 @@ func requeueClusterSummary(ctx context.Context, featureID configv1beta1.FeatureI
 	// Reset the hash a deployment happens again
 	logger.V(logs.LogDebug).Info(fmt.Sprintf("reset status of ClusterSummary %s/%s",
 		clusterSummary.Namespace, clusterSummary.Name))
-	clusterSummaryScope.SetFeatureStatus(featureID, configv1beta1.FeatureStatusProvisioning, nil)
+	clusterSummaryScope.SetFeatureStatus(featureID, configv1beta1.FeatureStatusProvisioning, nil, nil)
 
 	return c.Status().Update(ctx, clusterSummaryScope.ClusterSummary)
 }
diff --git a/controllers/handlers_helm.go b/controllers/handlers_helm.go
@@ -1343,8 +1343,8 @@ func newRegistryClientWithTLS(certFile, keyFile, caFile string, insecureSkipTLSv
 	return registryClient, nil
 }
 
-func actionConfigInit(namespace, kubeconfig string, registryOptions *registryClientOptions, enableClientCache bool,
-) (*action.Configuration, error) {
+func actionConfigInit(namespace, kubeconfig string, registryOptions *registryClientOptions,
+	enableClientCache bool) (*action.Configuration, error) {
 
 	actionConfig := new(action.Configuration)
 

diff --git a/manifest/manifest.yaml b/manifest/manifest.yaml
@@ -2419,6 +2419,13 @@ spec:
                   - namespace
                   type: object
                 type: array
+              maxConsecutiveFailures:
+                description: |-
+                  The maximum number of consecutive deployment failures that Sveltos will permit.
+                  After this many consecutive failures, the deployment will be considered failed, and Sveltos will stop retrying.
+                  This setting applies only to feature deployments, not resource removal.
+                  This field is optional. If not set, Sveltos default behavior is to keep retrying.
+                type: integer
               maxUpdate:
                 anyOf:
                 - type: integer
@@ -5269,6 +5276,13 @@ spec:
                       - namespace
                       type: object
                     type: array
+                  maxConsecutiveFailures:
+                    description: |-
+                      The maximum number of consecutive deployment failures that Sveltos will permit.
+                      After this many consecutive failures, the deployment will be considered failed, and Sveltos will stop retrying.
+                      This setting applies only to feature deployments, not resource removal.
+                      This field is optional. If not set, Sveltos default behavior is to keep retrying.
+                    type: integer
                   maxUpdate:
                     anyOf:
                     - type: integer
@@ -5672,6 +5686,12 @@ spec:
                     FeatureSummary contains a summary of the state of a workload
                     cluster feature.
                   properties:
+                    consecutiveFailures:
+                      description: |-
+                        ConsecutiveFailures is the number of consecutive deployment failures recorded for this feature.
+                        It is incremented each time a deployment fails and reset to zero when a deployment succeeds or the configuration changes.
+                        Once it reaches maxConsecutiveFailures (if set), Sveltos stops retrying.
+                      type: integer
                     deployedGroupVersionKind:
                       description: |-
                         DeployedGroupVersionKind contains all GroupVersionKinds deployed in either
@@ -5719,6 +5739,7 @@ spec:
                       - Removed
                       type: string
                   required:
+                  - consecutiveFailures
                   - featureID
                   type: object
                 type: array
@@ -7422,6 +7443,13 @@ spec:
                   - namespace
                   type: object
                 type: array
+              maxConsecutiveFailures:
+                description: |-
+                  The maximum number of consecutive deployment failures that Sveltos will permit.
+                  After this many consecutive failures, the deployment will be considered failed, and Sveltos will stop retrying.
+                  This setting applies only to feature deployments, not resource removal.
+                  This field is optional. If not set, Sveltos default behavior is to keep retrying.
+                type: integer
               maxUpdate:
                 anyOf:
                 - type: integer

diff --git a/pkg/scope/clustersummary.go b/pkg/scope/clustersummary.go
@@ -102,28 +102,51 @@ func (s *ClusterSummaryScope) initializeFeatureStatusSummary() {
 
 // SetFeatureStatus sets the feature status.
 func (s *ClusterSummaryScope) SetFeatureStatus(featureID configv1beta1.FeatureID,
-	status configv1beta1.FeatureStatus, hash []byte) {
+	status configv1beta1.FeatureStatus, hash []byte, failed *bool) {
 
 	for i := range s.ClusterSummary.Status.FeatureSummaries {
 		if s.ClusterSummary.Status.FeatureSummaries[i].FeatureID == featureID {
 			s.ClusterSummary.Status.FeatureSummaries[i].Status = status
 			s.ClusterSummary.Status.FeatureSummaries[i].Hash = hash
+			if failed != nil {
+				if *failed {
+					s.ClusterSummary.Status.FeatureSummaries[i].ConsecutiveFailures++
+				} else {
+					s.ClusterSummary.Status.FeatureSummaries[i].ConsecutiveFailures = 0
+				}
+			}
 			return
 		}
 	}
 
 	s.initializeFeatureStatusSummary()
 
+	consecutiveFailures := uint(0)
+	if failed != nil && *failed {
+		consecutiveFailures = 1
+	}
+
 	s.ClusterSummary.Status.FeatureSummaries = append(
 		s.ClusterSummary.Status.FeatureSummaries,
 		configv1beta1.FeatureSummary{
-			FeatureID: featureID,
-			Status:    status,
-			Hash:      hash,
+			FeatureID:           featureID,
+			Status:              status,
+			Hash:                hash,
+			ConsecutiveFailures: consecutiveFailures,
 		},
 	)
 }
 
+// ResetConsecutiveFailures resets to zero the ConsecutiveFailures counter of the FeatureSummary matching featureID, if any.
+func (s *ClusterSummaryScope) ResetConsecutiveFailures(featureID configv1beta1.FeatureID) {
+	for i := range s.ClusterSummary.Status.FeatureSummaries {
+		if s.ClusterSummary.Status.FeatureSummaries[i].FeatureID == featureID {
+			s.ClusterSummary.Status.FeatureSummaries[i].ConsecutiveFailures = 0
+			return
+		}
+	}
+}
+
 // SetDependenciesMessage sets the dependencies status.
 func (s *ClusterSummaryScope) SetDependenciesMessage(message *string) {
 	s.ClusterSummary.Status.Dependencies = message