Skip to content

Commit

Permalink
feat: tune leader elect timeouts (#75)
Browse files Browse the repository at this point in the history
* feat: tune leader elect timeouts

* add more logs

* add more logs

* add more logs

* add more logs

* add more logs

* add more logs

* add more logs

* add more logs

* add more logs

* no more logs - not working
  • Loading branch information
aldor007 authored Jul 31, 2023
1 parent bc07d34 commit 7b617fc
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 10 deletions.
6 changes: 4 additions & 2 deletions health/healthz.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import (
// HealthzCfg holds the time limits used by the healthz provider's checks.
type HealthzCfg struct {
// HealthyPollIntervalLimit is the max time between successful poll actions to consider cluster-controller alive.
HealthyPollIntervalLimit time.Duration
// StartTimeLimit is the max wait time for the application to start: while initialization is in progress,
// the health check returns an error once more than this duration has elapsed since initialization began.
StartTimeLimit time.Duration
}

func NewHealthzProvider(cfg HealthzCfg, log logrus.FieldLogger) *HealthzProvider {
Expand Down Expand Up @@ -43,8 +45,8 @@ func (h *HealthzProvider) Check(_ *http.Request) (err error) {
}

if h.initStartedAt != nil {
if time.Since(*h.initStartedAt) > h.cfg.HealthyPollIntervalLimit {
return fmt.Errorf("there was no sucessful poll action since start of application %s", h.cfg.HealthyPollIntervalLimit)
if time.Since(*h.initStartedAt) > h.cfg.StartTimeLimit {
return fmt.Errorf("there was no sucessful poll action since start of application %s", h.cfg.StartTimeLimit)
}
return nil
}
Expand Down
9 changes: 7 additions & 2 deletions health/healthz_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ func TestNewHealthzProvider(t *testing.T) {
t.Run("unhealthy statuses", func(t *testing.T) {

log := logrus.New()
h := NewHealthzProvider(HealthzCfg{HealthyPollIntervalLimit: time.Millisecond}, log)

t.Run("should return initialize timeout error", func(t *testing.T) {
r := require.New(t)
h := NewHealthzProvider(HealthzCfg{HealthyPollIntervalLimit: time.Millisecond, StartTimeLimit: time.Millisecond}, log)
h.Initializing()

time.Sleep(5 * time.Millisecond)
Expand All @@ -26,6 +26,7 @@ func TestNewHealthzProvider(t *testing.T) {

t.Run("should return action pool timeout error", func(t *testing.T) {
r := require.New(t)
h := NewHealthzProvider(HealthzCfg{HealthyPollIntervalLimit: time.Millisecond, StartTimeLimit: time.Millisecond}, log)
h.ActionPoll()

time.Sleep(5 * time.Millisecond)
Expand All @@ -35,16 +36,18 @@ func TestNewHealthzProvider(t *testing.T) {
})

t.Run("healthy statuses", func(t *testing.T) {

log := logrus.New()
h := NewHealthzProvider(HealthzCfg{HealthyPollIntervalLimit: 2 * time.Second}, log)

t.Run("cluster-controller is considered healthy before initialization", func(t *testing.T) {
r := require.New(t)
h := NewHealthzProvider(HealthzCfg{HealthyPollIntervalLimit: 2 * time.Second, StartTimeLimit: time.Millisecond}, log)

r.NoError(h.Check(nil))
})

t.Run("should return no error when still initializing", func(t *testing.T) {
h := NewHealthzProvider(HealthzCfg{HealthyPollIntervalLimit: 2 * time.Second, StartTimeLimit: time.Millisecond}, log)
h.Initializing()
r := require.New(t)

Expand All @@ -53,9 +56,11 @@ func TestNewHealthzProvider(t *testing.T) {

t.Run("should return no error when time since last action pool has not been long", func(t *testing.T) {
r := require.New(t)
h := NewHealthzProvider(HealthzCfg{HealthyPollIntervalLimit: 2 * time.Second, StartTimeLimit: time.Millisecond}, log)
h.ActionPoll()

r.NoError(h.Check(nil))
})
})

}
15 changes: 9 additions & 6 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ var (
Version = "local"
)

// leaderLeaseDuration is the value passed as LeaseDuration in the leader election configuration.
const leaderLeaseDuration = time.Second * 15

func main() {
cfg := config.Get()

Expand Down Expand Up @@ -130,7 +132,7 @@ func run(
Version: binVersion.Version,
Namespace: cfg.LeaderElection.Namespace,
}
healthzAction := health.NewHealthzProvider(health.HealthzCfg{HealthyPollIntervalLimit: (actionsConfig.PollWaitInterval + actionsConfig.PollTimeout) * 2}, log)
healthzAction := health.NewHealthzProvider(health.HealthzCfg{HealthyPollIntervalLimit: (actionsConfig.PollWaitInterval + actionsConfig.PollTimeout) * 2, StartTimeLimit: 2 * time.Minute}, log)

svc := actions.NewService(
log,
Expand All @@ -147,7 +149,7 @@ func run(
checks = append(checks, healthzAction)
var leaderHealthCheck *leaderelection.HealthzAdaptor
if cfg.LeaderElection.Enabled {
leaderHealthCheck = leaderelection.NewLeaderHealthzAdaptor(time.Minute * 2)
leaderHealthCheck = leaderelection.NewLeaderHealthzAdaptor(time.Minute)
checks = append(checks, leaderHealthCheck)
}
healthz.InstallHandler(httpMux, checks...)
Expand Down Expand Up @@ -206,9 +208,9 @@ func runWithLeaderElection(
// get elected before your background loop finished, violating
// the stated goal of the lease.
ReleaseOnCancel: true,
LeaseDuration: 60 * time.Second,
RenewDeadline: 15 * time.Second,
RetryPeriod: 5 * time.Second,
LeaseDuration: leaderLeaseDuration,
RenewDeadline: 10 * time.Second,
RetryPeriod: 3 * time.Second,
WatchDog: watchDog,
Callbacks: leaderelection.LeaderCallbacks{
OnStartedLeading: func(ctx context.Context) {
Expand All @@ -217,7 +219,8 @@ func runWithLeaderElection(
},
OnStoppedLeading: func() {
log.Infof("leader lost: %s", id)
os.Exit(0)
// We don't need to exit here.
// The "on started leading" callback receives a context that gets cancelled when we are no longer the leader.
},
OnNewLeader: func(identity string) {
// We're notified when new leader elected.
Expand Down

0 comments on commit 7b617fc

Please sign in to comment.