Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Playbook action retries #1851

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ gen-schemas:

.PHONY: manifests
manifests: controller-gen generate gen-schemas ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
$(CONTROLLER_GEN) crd paths="./api/..." output:crd:artifacts:config=config/crds
$(CONTROLLER_GEN) crd:allowDangerousTypes=true paths="./api/..." output:crd:artifacts:config=config/crds

.PHONY: generate
generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
Expand Down
41 changes: 41 additions & 0 deletions api/v1/playbook_actions.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
gocontext "context"
"encoding/json"
"fmt"
"math"
"time"

"github.com/flanksource/commons/duration"
Expand All @@ -16,6 +17,7 @@ import (
"github.com/flanksource/duty/types"
"github.com/google/uuid"
"github.com/samber/lo"
"golang.org/x/exp/rand"
"k8s.io/client-go/kubernetes"

"github.com/flanksource/incident-commander/api"
Expand Down Expand Up @@ -509,6 +511,42 @@ func (t *AWSConnection) Populate(ctx connectionContext, k8s kubernetes.Interface
return nil
}

// RetryExponent configures the exponential backoff applied between retries.
type RetryExponent struct {
// Multiplier is the base of the backoff: the wait before a retry is the
// configured duration multiplied by Multiplier raised to the retry number.
Multiplier float64 `json:"multiplier"`
}

type PlaybookActionRetry struct {
// Limit is the number of times to retry the action.
// With limit = 3, there will be a max of 4 attempts for the action (initial attempt + 3 retries).
Limit int `json:"limit"`

// Duration is the duration to wait before retrying the action.
Duration string `json:"duration"`

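// Jitter is the maximum random deviation, as a percentage of the wait duration,
// applied in either direction (e.g. 10 scales the wait by a factor between 0.9 and 1.1).
// Ranges from 0 to 100.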
// +kubebuilder:validation:Minimum=0
// +kubebuilder:validation:Maximum=100
Jitter float64 `json:"jitter,omitempty"`
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be an int then?


// Exponent is the exponential backoff configuration.
Exponent RetryExponent `json:"exponent"`
}

// NextRetryWait returns how long to wait before the given retry attempt.
//
// The wait is Duration * Multiplier^retryNumber, scaled by a random factor in
// [1 - Jitter/100, 1 + Jitter/100).
//
// A zero Multiplier (the unset value) is treated as 1, i.e. a constant
// interval, so that an unconfigured exponent does not collapse every retry
// wait to zero. A negative Multiplier is rejected because it would yield
// waits that alternate in sign.
//
// The result is clamped to [0, math.MaxInt64] nanoseconds so that an
// out-of-range jitter or a very large exponent can never produce a negative
// or overflowed duration.
//
// An error is returned if Duration cannot be parsed or Multiplier is negative.
func (t PlaybookActionRetry) NextRetryWait(retryNumber int) (time.Duration, error) {
	interval, err := duration.ParseDuration(t.Duration)
	if err != nil {
		return 0, fmt.Errorf("failed to parse duration(%s): %w", t.Duration, err)
	}

	multiplier := t.Exponent.Multiplier
	switch {
	case multiplier < 0:
		return 0, fmt.Errorf("invalid exponent multiplier(%v): must not be negative", multiplier)
	case multiplier == 0:
		// Zero value means "not configured": fall back to a constant interval.
		multiplier = 1
	}

	nextWaitDuration := float64(interval) * math.Pow(multiplier, float64(retryNumber))

	jitterFactor := 1 + ((rand.Float64()*2 - 1) * t.Jitter * 0.01) // Scales jitter within [-Jitter, +Jitter]
	nextWaitDurationWithJitter := nextWaitDuration * jitterFactor

	// Converting an out-of-range float64 to an integer type is
	// implementation-dependent in Go, so clamp before converting.
	switch {
	case math.IsNaN(nextWaitDurationWithJitter), nextWaitDurationWithJitter <= 0:
		return 0, nil
	case nextWaitDurationWithJitter >= float64(math.MaxInt64):
		return time.Duration(math.MaxInt64), nil
	}

	return time.Duration(nextWaitDurationWithJitter), nil
}

type PlaybookAction struct {
// delay is the parsed Delay
delay *time.Duration `json:"-" yaml:"-"`
Expand All @@ -526,6 +564,9 @@ type PlaybookAction struct {
// It's only sensitive to the minute, i.e. if you delay by 20s it can take up to a minute to execute.
Delay string `yaml:"delay,omitempty" json:"delay,omitempty"`

// Retry specifies the retry policy for the action.
Retry *PlaybookActionRetry `json:"retry,omitempty" yaml:"retry,omitempty"`

// Timeout is the maximum duration to let an action run before it's cancelled.
Timeout string `yaml:"timeout,omitempty" json:"timeout,omitempty"`

Expand Down
68 changes: 68 additions & 0 deletions api/v1/playbook_actions_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package v1

import (
"testing"
)

// TestNextRetryWait checks that NextRetryWait applies the exponential
// multiplier for the requested retry number and keeps the jittered result
// inside the expected window (in seconds).
func TestNextRetryWait(t *testing.T) {
	type testCase struct {
		name          string
		RetryCount    int
		Retry         PlaybookActionRetry
		ExpectedRange []float64
		ExpectedErr   bool
	}

	// Every case uses the same 30s base with a 1.5x multiplier; only the
	// jitter percentage and the retry number differ.
	policy := func(jitter float64) PlaybookActionRetry {
		return PlaybookActionRetry{
			Limit:    1,
			Duration: "30s",
			Exponent: RetryExponent{Multiplier: 1.5},
			Jitter:   jitter,
		}
	}

	cases := []testCase{
		{
			name:          "no jitter",
			RetryCount:    1,
			Retry:         policy(0),
			ExpectedRange: []float64{45, 45},
		},
		{
			name:          "no jitter second iteration",
			RetryCount:    2,
			Retry:         policy(0),
			ExpectedRange: []float64{67.5, 67.5},
		},
		{
			name:          "with jitter second iteration",
			RetryCount:    2,
			Retry:         policy(10),
			ExpectedRange: []float64{60, 75},
		},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			nextTime, err := tc.Retry.NextRetryWait(tc.RetryCount)

			gotErr := err != nil
			if gotErr != tc.ExpectedErr {
				t.Errorf("expected error: %v, got error: %v", tc.ExpectedErr, err)
			}

			lower, upper := tc.ExpectedRange[0], tc.ExpectedRange[1]
			seconds := nextTime.Seconds()
			if !(seconds >= lower && seconds <= upper) {
				t.Errorf("expected next time to be between %f and %f, got %v", tc.ExpectedRange[0], tc.ExpectedRange[1], nextTime)
			}
		})
	}
}
36 changes: 36 additions & 0 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 17 additions & 3 deletions cmd/playbook.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,17 +151,31 @@ var Run = &cobra.Command{
return
}

if action.Retry != nil {
if delay, err := action.Retry.NextRetryWait(1); err != nil {
logger.Errorf("error updating run delay: %v", err)
shutdown.ShutdownAndExit(1, err.Error())
return
} else {
fmt.Println(delay)
}
}

if action == nil {
logger.Errorf("No actions to run")
shutdown.ShutdownAndExit(1, err.Error())
return
}

for action != nil {
if delayed, err := runner.CheckDelay(ctx, *p, *run, action, step); err != nil {
ctx.Errorf("Error running action %s: %v", action.Name, err)
if delay, err := runner.GetDelay(ctx, *p, *run, action, step); err != nil {
ctx.Errorf("error getting delay %s: %v", action.Name, err)
break
} else if delayed {
} else if delay > 0 {
if err := run.Delay(ctx.DB(), delay); err != nil {
ctx.Errorf("error updating run delay: %v", err)
}

break
}

Expand Down
32 changes: 32 additions & 0 deletions config/crds/mission-control.flanksource.com_playbooks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1717,6 +1717,38 @@ spec:
- name
- spec
type: object
retry:
description: Retry specifies the retry policy for the action.
properties:
duration:
description: Duration is the duration to wait before retrying
the action.
type: string
exponent:
description: Exponent is the exponential backoff configuration.
properties:
multiplier:
type: number
required:
- multiplier
type: object
jitter:
description: |-
Jitter is the random factor to apply to the duration.
Ranges from 0 to 100.
maximum: 100
minimum: 0
type: number
limit:
description: |-
Limit is the number of times to retry the action.
With limit = 3, there will be a max of 4 attempts for the action (initial attempt + 3 retries).
type: integer
required:
- duration
- exponent
- limit
type: object
runsOn:
description: |-
RunsOn specifies the agents that can run this action.
Expand Down
38 changes: 38 additions & 0 deletions config/schemas/playbook-spec.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,9 @@
"delay": {
"type": "string"
},
"retry": {
"$ref": "#/$defs/PlaybookActionRetry"
},
"timeout": {
"type": "string"
},
Expand Down Expand Up @@ -789,6 +792,29 @@
"name"
]
},
"PlaybookActionRetry": {
"properties": {
"limit": {
"type": "integer"
},
"duration": {
"type": "string"
},
"jitter": {
"type": "number"
},
"exponent": {
"$ref": "#/$defs/RetryExponent"
}
},
"additionalProperties": false,
"type": "object",
"required": [
"limit",
"duration",
"exponent"
]
},
"PlaybookApproval": {
"properties": {
"type": {
Expand Down Expand Up @@ -1162,6 +1188,18 @@
},
"type": "array"
},
"RetryExponent": {
"properties": {
"multiplier": {
"type": "number"
}
},
"additionalProperties": false,
"type": "object",
"required": [
"multiplier"
]
},
"SQLAction": {
"properties": {
"connection": {
Expand Down
38 changes: 38 additions & 0 deletions config/schemas/playbook.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -889,6 +889,9 @@
"delay": {
"type": "string"
},
"retry": {
"$ref": "#/$defs/PlaybookActionRetry"
},
"timeout": {
"type": "string"
},
Expand Down Expand Up @@ -938,6 +941,29 @@
"name"
]
},
"PlaybookActionRetry": {
"properties": {
"limit": {
"type": "integer"
},
"duration": {
"type": "string"
},
"jitter": {
"type": "number"
},
"exponent": {
"$ref": "#/$defs/RetryExponent"
}
},
"additionalProperties": false,
"type": "object",
"required": [
"limit",
"duration",
"exponent"
]
},
"PlaybookApproval": {
"properties": {
"type": {
Expand Down Expand Up @@ -1316,6 +1342,18 @@
},
"type": "array"
},
"RetryExponent": {
"properties": {
"multiplier": {
"type": "number"
}
},
"additionalProperties": false,
"type": "object",
"required": [
"multiplier"
]
},
"SQLAction": {
"properties": {
"connection": {
Expand Down
Loading
Loading