Skip to content

Commit

Permalink
Fix ProgressivePerGroup handling
Browse files Browse the repository at this point in the history
- MandatoryDecisionGroups should tolerate no failures
- The logic for ProgressivePerGroup was incorrect

Signed-off-by: Dale Haiducek <[email protected]>
  • Loading branch information
dhaiducek committed Nov 6, 2023
1 parent 00fc219 commit de4a95b
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 121 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,10 @@ spec:
and ProgressivePerGroup. For Progressive, this
is considered over the total number of clusters.
For ProgressivePerGroup, this is considered according
to the size of the current group. Default is that
no failures are tolerated.
to the size of the current group. For both Progressive
and ProgressivePerGroup, the MaxFailures does
not apply for MandatoryDecisionGroups, which tolerate
no failures. Default is that no failures are tolerated.
pattern: ^((100|[0-9]{1,2})%|[0-9]+)$
x-kubernetes-int-or-string: true
minSuccessTime:
Expand Down Expand Up @@ -255,8 +257,10 @@ spec:
and ProgressivePerGroup. For Progressive, this
is considered over the total number of clusters.
For ProgressivePerGroup, this is considered according
to the size of the current group. Default is that
no failures are tolerated.
to the size of the current group. For both Progressive
and ProgressivePerGroup, the MaxFailures does
not apply for MandatoryDecisionGroups, which tolerate
no failures. Default is that no failures are tolerated.
pattern: ^((100|[0-9]{1,2})%|[0-9]+)$
x-kubernetes-int-or-string: true
minSuccessTime:
Expand Down Expand Up @@ -339,8 +343,10 @@ spec:
and ProgressivePerGroup. For Progressive, this
is considered over the total number of clusters.
For ProgressivePerGroup, this is considered according
to the size of the current group. Default is that
no failures are tolerated.
to the size of the current group. For both Progressive
and ProgressivePerGroup, the MaxFailures does
not apply for MandatoryDecisionGroups, which tolerate
no failures. Default is that no failures are tolerated.
pattern: ^((100|[0-9]{1,2})%|[0-9]+)$
x-kubernetes-int-or-string: true
minSuccessTime:
Expand Down Expand Up @@ -385,20 +391,6 @@ spec:
type: object
type:
default: All
description: Rollout strategy Types are All, Progressive
and ProgressivePerGroup 1) All means apply the workload
to all clusters in the decision groups at once. 2)
Progressive means apply the workload to the selected
clusters progressively per cluster. The workload will
not be applied to the next cluster unless one of the
current applied clusters reach the successful state
and haven't breached the MaxFailures configuration.
3) ProgressivePerGroup means apply the workload to
decisionGroup clusters progressively per group. The
workload will not be applied to the next decisionGroup
unless all clusters in the current group reach the
successful state and haven't breached the MaxFailures
configuration.
enum:
- All
- Progressive
Expand Down
70 changes: 32 additions & 38 deletions cluster/v1alpha1/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,10 +162,10 @@ func (r *RolloutHandler[T]) getProgressiveClusters(rolloutStrategy RolloutStrate
groupKeys := decisionGroupsToGroupKeys(strategy.Progressive.MandatoryDecisionGroups.MandatoryDecisionGroups)
clusterGroups = r.pdTracker.ExistingClusterGroups(groupKeys...)

// Perform progressive rollOut for mandatory decision groups first.
// Perform progressive rollOut for mandatory decision groups first, tolerating no failures
if len(clusterGroups) > 0 {
rolloutResult := progressivePerGroup(
clusterGroups, intstr.FromInt(maxFailures), minSuccessTime, failureTimeout, currentClusterStatus, true,
clusterGroups, intstr.FromInt(0), minSuccessTime, failureTimeout, currentClusterStatus,
)
if len(rolloutResult.ClustersToRollout) > 0 || len(rolloutResult.ClustersTimeOut) > 0 {
rolloutResult.ClustersRemoved = removedClusterStatus
Expand Down Expand Up @@ -219,9 +219,9 @@ func (r *RolloutHandler[T]) getProgressivePerGroupClusters(rolloutStrategy Rollo
groupKeys := decisionGroupsToGroupKeys(mandatoryDecisionGroups)
clusterGroups = r.pdTracker.ExistingClusterGroups(groupKeys...)

// Perform progressive rollout per group for mandatory decision groups first
// Perform progressive rollout per group for mandatory decision groups first, tolerating no failures
if len(clusterGroups) > 0 {
rolloutResult := progressivePerGroup(clusterGroups, maxFailures, minSuccessTime, failureTimeout, currentClusterStatus, false)
rolloutResult := progressivePerGroup(clusterGroups, intstr.FromInt(0), minSuccessTime, failureTimeout, currentClusterStatus)

if len(rolloutResult.ClustersToRollout) > 0 || len(rolloutResult.ClustersTimeOut) > 0 {
rolloutResult.ClustersRemoved = removedClusterStatus
Expand All @@ -233,7 +233,7 @@ func (r *RolloutHandler[T]) getProgressivePerGroupClusters(rolloutStrategy Rollo
restClusterGroups := r.pdTracker.ExistingClusterGroupsBesides(groupKeys...)

// Perform progressive rollout per group for the remaining decision groups
rolloutResult := progressivePerGroup(restClusterGroups, maxFailures, minSuccessTime, failureTimeout, currentClusterStatus, false)
rolloutResult := progressivePerGroup(restClusterGroups, maxFailures, minSuccessTime, failureTimeout, currentClusterStatus)
rolloutResult.ClustersRemoved = removedClusterStatus

return &strategy, rolloutResult, nil
Expand Down Expand Up @@ -291,7 +291,7 @@ func progressivePerCluster(

// If there was a breach of MaxFailures, only handle clusters that have already had workload applied
if !failureBreach || failureBreach && status.Status != ToApply {
rolloutClusters, timeoutClusters = determineRolloutStatus(status, minSuccessTime, timeout, rolloutClusters, timeoutClusters)
rolloutClusters, timeoutClusters = determineRolloutStatus(&status, minSuccessTime, timeout, rolloutClusters, timeoutClusters)
}

// Keep track of TimeOut or Failed clusters and check total against MaxFailures
Expand Down Expand Up @@ -360,44 +360,43 @@ func progressivePerGroup(
minSuccessTime time.Duration,
timeout time.Duration,
existingClusterStatus []ClusterRolloutStatus,
accumulateFailures bool,
) RolloutResult {
var rolloutClusters, timeoutClusters []ClusterRolloutStatus
existingClusters := make(map[string]bool)
existingClusters := make(map[string]RolloutStatus)

for _, status := range existingClusterStatus {
if status.ClusterName == "" {
continue
}

if status.Status == ToApply {
// Set as false to consider the cluster in the decisionGroups iteration.
existingClusters[status.ClusterName] = false
} else {
existingClusters[status.ClusterName] = true
rolloutClusters, timeoutClusters = determineRolloutStatus(status, minSuccessTime, timeout, rolloutClusters, timeoutClusters)
// ToApply will be reconsidered in the decisionGroups iteration.
if status.Status != ToApply {
rolloutClusters, timeoutClusters = determineRolloutStatus(&status, minSuccessTime, timeout, rolloutClusters, timeoutClusters)
existingClusters[status.ClusterName] = status.Status
}
}

var failureCount int
totalFailureCount := 0
failureBreach := false
clusterGroupKeys := clusterGroupsMap.GetOrderedGroupKeys()
for _, key := range clusterGroupKeys {
groupFailureCount := 0
if subclusters, ok := clusterGroupsMap[key]; ok {
// Only reset the failure count for ProgressivePerGroup
// since Progressive is over the total number of clusters
if !accumulateFailures {
failureCount = 0
}
failureBreach = false
// Calculate the max failure threshold for the group--the returned error was checked
// previously, so it's ignored here
maxGroupFailures, _ := calculateRolloutSize(maxFailures, subclusters.Len(), 0)
maxGroupFailures, _ := calculateRolloutSize(maxFailures, len(subclusters), 0)
// Iterate through clusters in the group
clusters := subclusters.UnsortedList()
sort.Strings(clusters)
for _, cluster := range clusters {
if existingClusters[cluster] {
if status, ok := existingClusters[cluster]; ok {
// Keep track of TimeOut or Failed clusters and check total against MaxFailures
if status == TimeOut || status == Failed {
groupFailureCount++

failureBreach = groupFailureCount > maxGroupFailures
}

continue
}

Expand All @@ -407,18 +406,13 @@ func progressivePerGroup(
GroupKey: key,
}
rolloutClusters = append(rolloutClusters, status)

// Keep track of TimeOut or Failed clusters and check total against MaxFailures
if status.Status == TimeOut || status.Status == Failed {
failureCount++

failureBreach = failureCount > maxGroupFailures
}
}

// As it is perGroup Return if there are clusters to rollOut,
// or there was a breach of the MaxFailure configuration
if len(rolloutClusters) > maxGroupFailures || failureBreach {
totalFailureCount += groupFailureCount

// As it is perGroup, return if there are clusters to rollOut that aren't
// Failed/Timeout, or there was a breach of the MaxFailure configuration
if len(rolloutClusters)-totalFailureCount > 0 || failureBreach {
return RolloutResult{
ClustersToRollout: rolloutClusters,
ClustersTimeOut: timeoutClusters,
Expand Down Expand Up @@ -448,7 +442,7 @@ func progressivePerGroup(
// the rollOut Clusters.
// 2. If timeout is set to 0, the function append the clusterStatus to the timeOut clusters.
func determineRolloutStatus(
status ClusterRolloutStatus,
status *ClusterRolloutStatus,
minSuccessTime time.Duration,
timeout time.Duration,
rolloutClusters []ClusterRolloutStatus,
Expand All @@ -457,13 +451,13 @@ func determineRolloutStatus(

switch status.Status {
case ToApply:
rolloutClusters = append(rolloutClusters, status)
rolloutClusters = append(rolloutClusters, *status)
case Succeeded:
// If the cluster succeeded but is still within the MinSuccessTime (i.e. "soak" time),
// still add it to the list of rolloutClusters
minSuccessTimeTime := getTimeOutTime(status.LastTransitionTime, minSuccessTime)
if RolloutClock.Now().Before(minSuccessTimeTime.Time) {
rolloutClusters = append(rolloutClusters, status)
rolloutClusters = append(rolloutClusters, *status)
}

return rolloutClusters, timeoutClusters
Expand All @@ -474,10 +468,10 @@ func determineRolloutStatus(
status.TimeOutTime = timeOutTime
// check if current time is before the timeout time
if RolloutClock.Now().Before(timeOutTime.Time) {
rolloutClusters = append(rolloutClusters, status)
rolloutClusters = append(rolloutClusters, *status)
} else {
status.Status = TimeOut
timeoutClusters = append(timeoutClusters, status)
timeoutClusters = append(timeoutClusters, *status)
}
}

Expand Down
Loading

0 comments on commit de4a95b

Please sign in to comment.