Skip to content

Commit

Permalink
(feat) introduce maxConsecutiveFailures knob
Browse files Browse the repository at this point in the history
The maxConsecutiveFailures option allows control over deployment retry behavior.
This optional field defines a threshold for consecutive deployment failures.
After the specified number of consecutive failures, Sveltos will stop retrying
the deployment.
Retries will only resume if the profile configuration is updated.

If maxConsecutiveFailures is not configured, Sveltos will retry indefinitely.
  • Loading branch information
gianlucam76 committed Feb 13, 2025
1 parent 60c1594 commit abb5705
Show file tree
Hide file tree
Showing 15 changed files with 216 additions and 24 deletions.
1 change: 1 addition & 0 deletions api/v1alpha1/zz_generated.conversion.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions api/v1beta1/clustersummary_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@ type FeatureSummary struct {
// +optional
Hash []byte `json:"hash,omitempty"`

// ConsecutiveFailures is the number of consecutive deployment failures recorded for this feature.
// It is incremented each time a deployment fails and reset to zero when a deployment succeeds
// or the configuration changes. It is compared against the profile's maxConsecutiveFailures
// to decide whether Sveltos stops retrying.
ConsecutiveFailures uint `json:"consecutiveFailures"`

// Status represents the state of the feature in the workload cluster
// +optional
Status FeatureStatus `json:"status,omitempty"`
Expand Down
7 changes: 7 additions & 0 deletions api/v1beta1/spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -724,6 +724,13 @@ type Spec struct {
// +optional
DriftExclusions []DriftExclusion `json:"driftExclusions,omitempty"`

// The maximum number of consecutive deployment failures that Sveltos will permit.
// After this many consecutive failures, the deployment will be considered failed, and Sveltos will stop retrying.
// This setting applies only to feature deployments, not resource removal.
// This field is optional. If not set, Sveltos default behavior is to keep retrying.
// +optional
MaxConsecutiveFailures *uint `json:"maxConsecutiveFailures,omitempty"`

// ExtraLabels: These labels will be added by Sveltos to all Kubernetes resources deployed in
// a managed cluster based on this ClusterProfile/Profile instance.
// **Important:** If a resource deployed by Sveltos already has a label with a key present in
Expand Down
5 changes: 5 additions & 0 deletions api/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -1638,6 +1638,13 @@ spec:
- namespace
type: object
type: array
maxConsecutiveFailures:
description: |-
The maximum number of consecutive deployment failures that Sveltos will permit.
After this many consecutive failures, the deployment will be considered failed, and Sveltos will stop retrying.
This setting applies only to feature deployments, not resource removal.
This field is optional. If not set, Sveltos default behavior is to keep retrying.
type: integer
maxUpdate:
anyOf:
- type: integer
Expand Down
14 changes: 14 additions & 0 deletions config/crd/bases/config.projectsveltos.io_clustersummaries.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1666,6 +1666,13 @@ spec:
- namespace
type: object
type: array
maxConsecutiveFailures:
description: |-
The maximum number of consecutive deployment failures that Sveltos will permit.
After this many consecutive failures, the deployment will be considered failed, and Sveltos will stop retrying.
This setting applies only to feature deployments, not resource removal.
This field is optional. If not set, Sveltos default behavior is to keep retrying.
type: integer
maxUpdate:
anyOf:
- type: integer
Expand Down Expand Up @@ -2069,6 +2076,12 @@ spec:
FeatureSummary contains a summary of the state of a workload
cluster feature.
properties:
consecutiveFailures:
description: |-
The number of consecutive deployment failures recorded for this feature.
It is incremented on each failed deployment and reset to zero when a deployment
succeeds or the configuration changes.
type: integer
deployedGroupVersionKind:
description: |-
DeployedGroupVersionKind contains all GroupVersionKinds deployed in either
Expand Down Expand Up @@ -2116,6 +2129,7 @@ spec:
- Removed
type: string
required:
- consecutiveFailures
- featureID
type: object
type: array
Expand Down
7 changes: 7 additions & 0 deletions config/crd/bases/config.projectsveltos.io_profiles.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1638,6 +1638,13 @@ spec:
- namespace
type: object
type: array
maxConsecutiveFailures:
description: |-
The maximum number of consecutive deployment failures that Sveltos will permit.
After this many consecutive failures, the deployment will be considered failed, and Sveltos will stop retrying.
This setting applies only to feature deployments, not resource removal.
This field is optional. If not set, Sveltos default behavior is to keep retrying.
type: integer
maxUpdate:
anyOf:
- type: integer
Expand Down
6 changes: 3 additions & 3 deletions controllers/clustersummary_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1285,13 +1285,13 @@ func (r *ClusterSummaryReconciler) setFailureMessage(clusterSummaryScope *scope.

// resetFeatureStatus sets status on each feature (Helm, Resources, Kustomize) whose
// corresponding ClusterProfileSpec field (HelmCharts, PolicyRefs, KustomizationRefs) is non-nil.
// The hash is passed as nil in every call.
//
// NOTE(review): this listing is a rendered diff; each branch shows the pre-change three-argument
// call followed by its four-argument replacement — confirm against the actual file.
func (r *ClusterSummaryReconciler) resetFeatureStatus(clusterSummaryScope *scope.ClusterSummaryScope, status configv1beta1.FeatureStatus) {
if clusterSummaryScope.ClusterSummary.Spec.ClusterProfileSpec.HelmCharts != nil {
clusterSummaryScope.SetFeatureStatus(configv1beta1.FeatureHelm, status, nil)
clusterSummaryScope.SetFeatureStatus(configv1beta1.FeatureHelm, status, nil, nil)
}
if clusterSummaryScope.ClusterSummary.Spec.ClusterProfileSpec.PolicyRefs != nil {
clusterSummaryScope.SetFeatureStatus(configv1beta1.FeatureResources, status, nil)
clusterSummaryScope.SetFeatureStatus(configv1beta1.FeatureResources, status, nil, nil)
}
if clusterSummaryScope.ClusterSummary.Spec.ClusterProfileSpec.KustomizationRefs != nil {
clusterSummaryScope.SetFeatureStatus(configv1beta1.FeatureKustomize, status, nil)
clusterSummaryScope.SetFeatureStatus(configv1beta1.FeatureKustomize, status, nil, nil)
}
}

Expand Down
58 changes: 53 additions & 5 deletions controllers/clustersummary_deployer.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,13 +102,22 @@ func (r *ClusterSummaryReconciler) deployFeature(ctx context.Context, clusterSum
if !isConfigSame {
logger.V(logs.LogDebug).Info(fmt.Sprintf("configuration has changed. Current hash %x. Previous hash %x",
currentHash, hash))
clusterSummaryScope.ResetConsecutiveFailures(f.id)
}

if !r.shouldRedeploy(clusterSummaryScope, f, isConfigSame, logger) {
logger.V(logs.LogDebug).Info("no need to redeploy")
return nil
}

return r.proceedDeployingFeature(ctx, clusterSummaryScope, f, isConfigSame, currentHash, logger)
}

func (r *ClusterSummaryReconciler) proceedDeployingFeature(ctx context.Context, clusterSummaryScope *scope.ClusterSummaryScope,
f feature, isConfigSame bool, currentHash []byte, logger logr.Logger) error {

clusterSummary := clusterSummaryScope.ClusterSummary

var status *configv1beta1.FeatureStatus
var resultError error

Expand All @@ -135,6 +144,12 @@ func (r *ClusterSummaryReconciler) deployFeature(ctx context.Context, clusterSum
r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, resultError, logger)
return nil
}
if r.maxNumberOfConsecutiveFailureReached(clusterSummaryScope, f, logger) {
nonRetriableStatus := configv1beta1.FeatureStatusFailedNonRetriable
resultError := errors.New("the maximum number of consecutive errors has been reached")
r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, resultError, logger)
return nil
}
}
if *status == configv1beta1.FeatureStatusProvisioning {
return fmt.Errorf("feature is still being provisioned")
Expand Down Expand Up @@ -355,6 +370,20 @@ func (r *ClusterSummaryReconciler) getHash(clusterSummaryScope *scope.ClusterSum
return nil
}

// getConsecutiveFailures reports how many consecutive failures are recorded in the
// ClusterSummary for featureID. It returns 0 when getFeatureSummaryForFeatureID
// yields no FeatureSummary for that feature.
func (r *ClusterSummaryReconciler) getConsecutiveFailures(clusterSummaryScope *scope.ClusterSummaryScope,
	featureID configv1beta1.FeatureID) uint {

	summary := getFeatureSummaryForFeatureID(clusterSummaryScope.ClusterSummary, featureID)
	if summary == nil {
		return 0
	}

	return summary.ConsecutiveFailures
}

func (r *ClusterSummaryReconciler) updateFeatureStatus(clusterSummaryScope *scope.ClusterSummaryScope,
featureID configv1beta1.FeatureID, status *configv1beta1.FeatureStatus, hash []byte, statusError error,
logger logr.Logger) {
Expand All @@ -368,17 +397,20 @@ func (r *ClusterSummaryReconciler) updateFeatureStatus(clusterSummaryScope *scop

switch *status {
case configv1beta1.FeatureStatusProvisioned:
clusterSummaryScope.SetFeatureStatus(featureID, configv1beta1.FeatureStatusProvisioned, hash)
failed := false
clusterSummaryScope.SetFeatureStatus(featureID, configv1beta1.FeatureStatusProvisioned, hash, &failed)
clusterSummaryScope.SetFailureMessage(featureID, nil)
case configv1beta1.FeatureStatusRemoved:
clusterSummaryScope.SetFeatureStatus(featureID, configv1beta1.FeatureStatusRemoved, hash)
failed := false
clusterSummaryScope.SetFeatureStatus(featureID, configv1beta1.FeatureStatusRemoved, hash, &failed)
clusterSummaryScope.SetFailureMessage(featureID, nil)
case configv1beta1.FeatureStatusProvisioning:
clusterSummaryScope.SetFeatureStatus(featureID, configv1beta1.FeatureStatusProvisioning, hash)
clusterSummaryScope.SetFeatureStatus(featureID, configv1beta1.FeatureStatusProvisioning, hash, nil)
case configv1beta1.FeatureStatusRemoving:
clusterSummaryScope.SetFeatureStatus(featureID, configv1beta1.FeatureStatusRemoving, hash)
clusterSummaryScope.SetFeatureStatus(featureID, configv1beta1.FeatureStatusRemoving, hash, nil)
case configv1beta1.FeatureStatusFailed, configv1beta1.FeatureStatusFailedNonRetriable:
clusterSummaryScope.SetFeatureStatus(featureID, *status, hash)
failed := true
clusterSummaryScope.SetFeatureStatus(featureID, *status, hash, &failed)
err := statusError.Error()
clusterSummaryScope.SetFailureMessage(featureID, &err)
}
Expand Down Expand Up @@ -436,3 +468,19 @@ func (r *ClusterSummaryReconciler) shouldRedeploy(clusterSummaryScope *scope.Clu

return true
}

// maxNumberOfConsecutiveFailureReached reports whether the feature has failed at least as many
// consecutive times as the MaxConsecutiveFailures threshold set in the ClusterProfileSpec.
// It returns false when no threshold is configured.
func (r *ClusterSummaryReconciler) maxNumberOfConsecutiveFailureReached(clusterSummaryScope *scope.ClusterSummaryScope, f feature,
	logger logr.Logger) bool {

	// A nil threshold means the option is not set.
	if clusterSummaryScope.ClusterSummary.Spec.ClusterProfileSpec.MaxConsecutiveFailures == nil {
		return false
	}

	failures := r.getConsecutiveFailures(clusterSummaryScope, f.id)
	limit := *clusterSummaryScope.ClusterSummary.Spec.ClusterProfileSpec.MaxConsecutiveFailures
	if failures < limit {
		return false
	}

	logger.V(logs.LogDebug).Info(fmt.Sprintf("max number of consecutive failures reached %d", failures))
	return true
}
2 changes: 1 addition & 1 deletion controllers/conflicts.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ func requeueClusterSummary(ctx context.Context, featureID configv1beta1.FeatureI
// Reset the hash so a deployment happens again
logger.V(logs.LogDebug).Info(fmt.Sprintf("reset status of ClusterSummary %s/%s",
clusterSummary.Namespace, clusterSummary.Name))
clusterSummaryScope.SetFeatureStatus(featureID, configv1beta1.FeatureStatusProvisioning, nil)
clusterSummaryScope.SetFeatureStatus(featureID, configv1beta1.FeatureStatusProvisioning, nil, nil)

return c.Status().Update(ctx, clusterSummaryScope.ClusterSummary)
}
4 changes: 2 additions & 2 deletions controllers/handlers_helm.go
Original file line number Diff line number Diff line change
Expand Up @@ -1343,8 +1343,8 @@ func newRegistryClientWithTLS(certFile, keyFile, caFile string, insecureSkipTLSv
return registryClient, nil
}

func actionConfigInit(namespace, kubeconfig string, registryOptions *registryClientOptions, enableClientCache bool,
) (*action.Configuration, error) {
func actionConfigInit(namespace, kubeconfig string, registryOptions *registryClientOptions,
enableClientCache bool) (*action.Configuration, error) {

actionConfig := new(action.Configuration)

Expand Down
28 changes: 28 additions & 0 deletions manifest/manifest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2419,6 +2419,13 @@ spec:
- namespace
type: object
type: array
maxConsecutiveFailures:
description: |-
The maximum number of consecutive deployment failures that Sveltos will permit.
After this many consecutive failures, the deployment will be considered failed, and Sveltos will stop retrying.
This setting applies only to feature deployments, not resource removal.
This field is optional. If not set, Sveltos default behavior is to keep retrying.
type: integer
maxUpdate:
anyOf:
- type: integer
Expand Down Expand Up @@ -5269,6 +5276,13 @@ spec:
- namespace
type: object
type: array
maxConsecutiveFailures:
description: |-
The maximum number of consecutive deployment failures that Sveltos will permit.
After this many consecutive failures, the deployment will be considered failed, and Sveltos will stop retrying.
This setting applies only to feature deployments, not resource removal.
This field is optional. If not set, Sveltos default behavior is to keep retrying.
type: integer
maxUpdate:
anyOf:
- type: integer
Expand Down Expand Up @@ -5672,6 +5686,12 @@ spec:
FeatureSummary contains a summary of the state of a workload
cluster feature.
properties:
consecutiveFailures:
description: |-
The number of consecutive deployment failures recorded for this feature.
It is incremented on each failed deployment and reset to zero when a deployment
succeeds or the configuration changes.
type: integer
deployedGroupVersionKind:
description: |-
DeployedGroupVersionKind contains all GroupVersionKinds deployed in either
Expand Down Expand Up @@ -5719,6 +5739,7 @@ spec:
- Removed
type: string
required:
- consecutiveFailures
- featureID
type: object
type: array
Expand Down Expand Up @@ -7422,6 +7443,13 @@ spec:
- namespace
type: object
type: array
maxConsecutiveFailures:
description: |-
The maximum number of consecutive deployment failures that Sveltos will permit.
After this many consecutive failures, the deployment will be considered failed, and Sveltos will stop retrying.
This setting applies only to feature deployments, not resource removal.
This field is optional. If not set, Sveltos default behavior is to keep retrying.
type: integer
maxUpdate:
anyOf:
- type: integer
Expand Down
31 changes: 27 additions & 4 deletions pkg/scope/clustersummary.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,28 +102,51 @@ func (s *ClusterSummaryScope) initializeFeatureStatusSummary() {

// SetFeatureStatus sets the status and hash of the FeatureSummary matching featureID,
// appending a new FeatureSummary when none matches.
//
// failed controls the ConsecutiveFailures counter:
//   - nil: an existing counter is left unchanged (a newly appended entry starts at 0);
//   - true: an existing counter is incremented (a newly appended entry starts at 1);
//   - false: the counter is set to 0.
//
// NOTE(review): this listing is a rendered diff, so the old and new signature lines and the
// old and new struct-literal fields both appear below — confirm against the actual file.
func (s *ClusterSummaryScope) SetFeatureStatus(featureID configv1beta1.FeatureID,
status configv1beta1.FeatureStatus, hash []byte) {
status configv1beta1.FeatureStatus, hash []byte, failed *bool) {

// Existing entry: update it in place and return.
for i := range s.ClusterSummary.Status.FeatureSummaries {
if s.ClusterSummary.Status.FeatureSummaries[i].FeatureID == featureID {
s.ClusterSummary.Status.FeatureSummaries[i].Status = status
s.ClusterSummary.Status.FeatureSummaries[i].Hash = hash
// A nil failed flag leaves the counter as it is.
if failed != nil {
if *failed {
s.ClusterSummary.Status.FeatureSummaries[i].ConsecutiveFailures++
} else {
s.ClusterSummary.Status.FeatureSummaries[i].ConsecutiveFailures = 0
}
}
return
}
}

s.initializeFeatureStatusSummary()

// No existing entry matched: seed the counter for the entry appended below.
consecutiveFailures := uint(0)
if failed != nil && *failed {
consecutiveFailures = 1
}

s.ClusterSummary.Status.FeatureSummaries = append(
s.ClusterSummary.Status.FeatureSummaries,
configv1beta1.FeatureSummary{
FeatureID: featureID,
Status: status,
Hash: hash,
FeatureID: featureID,
Status: status,
Hash: hash,
ConsecutiveFailures: consecutiveFailures,
},
)
}

// ResetConsecutiveFailures zeroes the consecutive-failure counter of the first
// FeatureSummary matching featureID. It does nothing when no FeatureSummary matches.
func (s *ClusterSummaryScope) ResetConsecutiveFailures(featureID configv1beta1.FeatureID) {
	summaries := s.ClusterSummary.Status.FeatureSummaries
	for idx := range summaries {
		if summaries[idx].FeatureID != featureID {
			continue
		}
		summaries[idx].ConsecutiveFailures = 0
		return
	}
}

// SetDependenciesMessage sets the dependencies status.
func (s *ClusterSummaryScope) SetDependenciesMessage(message *string) {
s.ClusterSummary.Status.Dependencies = message
Expand Down
Loading

0 comments on commit abb5705

Please sign in to comment.