Skip to content

Commit

Permalink
Add telemetry job (#1448)
Browse files Browse the repository at this point in the history
Problem:

We want to have a telemetry job that periodically reports product
telemetry every 24h. For now, telemetry data is empty and report is sent
to the debug log.

Solution:

- Refactor leader election to use controller-runtime manager
capabilities. This simplifies the existing code and make it easier to
add a telemetry Job.
- Add a telemetry Job that periodically reports empty telemetry to
the debug log.
- Make the period configurable at build time via TELEMETRY_REPORT_PERIOD
Makefile variable.

Note: leader elector refactoring changes behavior of NGF process
when leadership gets lost:
Before: the Manager would shutdown waiting for the runnables to exit.
After: the Manager doesn't wait. It similar to NGF process panicing.
This should be OK, as NGF container will restart and recover any
potentially broken state (update not fully populated statuses, restore
correct NGINX configuration).

Testing:
- Unit tests
- Manual testing:
  - Ensure leader election works as expected - both leader and
    non-pods run successfully.
  - Ensure NGF container exits when stop being leader.
  - Ensure an upgrade from Release 1.1.0 is successful for leader
    election - the leader gets elected among the new pods.
  - Ensure the telemetry Job reports telemetry multiple times, using
  a small value of ELEMETRY_REPORT_PERIOD

CLOSES #1382

Co-authored-by: Saylor Berman <[email protected]>
  • Loading branch information
pleshakov and sjberman authored Jan 10, 2024
1 parent ca6c2ff commit 9d9c1f2
Show file tree
Hide file tree
Showing 18 changed files with 479 additions and 128 deletions.
2 changes: 2 additions & 0 deletions .goreleaser.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ builds:
- all=-trimpath={{.Env.GOPATH}}
asmflags:
- all=-trimpath={{.Env.GOPATH}}
ldflags:
- -s -w -X main.version={{.Version}} -X main.commit={{.Commit}} -X main.date={{.Date}} -X main.telemetryReportPeriod=24h
main: ./cmd/gateway/
binary: gateway

Expand Down
1 change: 1 addition & 0 deletions .yamllint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ rules:
deploy/manifests/nginx-gateway.yaml
deploy/manifests/crds
tests/longevity/manifests/cronjob.yaml
.goreleaser.yml
new-line-at-end-of-file: enable
new-lines: enable
octal-values: disable
Expand Down
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@ NGINX_CONF_DIR = internal/mode/static/nginx/conf
NJS_DIR = internal/mode/static/nginx/modules/src
NGINX_DOCKER_BUILD_PLUS_ARGS = --secret id=nginx-repo.crt,src=nginx-repo.crt --secret id=nginx-repo.key,src=nginx-repo.key
BUILD_AGENT=local
TELEMETRY_REPORT_PERIOD = 24h # also configured in goreleaser.yml
GW_API_VERSION = 1.0.0
INSTALL_WEBHOOK = false

# go build flags - should not be overridden by the user
GO_LINKER_FlAGS_VARS = -X main.version=${VERSION} -X main.commit=${GIT_COMMIT} -X main.date=${DATE}
GO_LINKER_FlAGS_VARS = -X main.version=${VERSION} -X main.commit=${GIT_COMMIT} -X main.date=${DATE} -X main.telemetryReportPeriod=${TELEMETRY_REPORT_PERIOD}
GO_LINKER_FLAGS_OPTIMIZATIONS = -s -w
GO_LINKER_FLAGS = $(GO_LINKER_FLAGS_OPTIMIZATIONS) $(GO_LINKER_FlAGS_VARS)

Expand Down
8 changes: 7 additions & 1 deletion cmd/gateway/commands.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,11 @@ func createStaticModeCommand() *cobra.Command {
return errors.New("POD_NAME environment variable must be set")
}

period, err := time.ParseDuration(telemetryReportPeriod)
if err != nil {
return fmt.Errorf("error parsing telemetry report period: %w", err)
}

var gwNsName *types.NamespacedName
if cmd.Flags().Changed(gatewayFlag) {
gwNsName = &gateway.value
Expand Down Expand Up @@ -163,7 +168,8 @@ func createStaticModeCommand() *cobra.Command {
LockName: leaderElectionLockName.String(),
Identity: podName,
},
Plus: plus,
Plus: plus,
TelemetryReportPeriod: period,
}

if err := static.StartManager(conf); err != nil {
Expand Down
5 changes: 4 additions & 1 deletion cmd/gateway/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,14 @@ import (
"os"
)

// Set during go build
var (
// Set during go build
version string
commit string
date string

// telemetryReportPeriod is the period at which telemetry reports are sent.
telemetryReportPeriod string
)

func main() {
Expand Down
5 changes: 5 additions & 0 deletions internal/framework/runnables/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
/*
Package runnables provides helper types for creating runnables for the controller-runtime manager when
leader election is enabled.
*/
package runnables
62 changes: 62 additions & 0 deletions internal/framework/runnables/runnables.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package runnables

import (
"context"

"sigs.k8s.io/controller-runtime/pkg/manager"
)

// Leader is a Runnable that needs to be run only when the current instance is the leader.
type Leader struct {
manager.Runnable
}

var (
_ manager.LeaderElectionRunnable = &Leader{}
_ manager.Runnable = &Leader{}
)

func (r *Leader) NeedLeaderElection() bool {
return true
}

// LeaderOrNonLeader is a Runnable that needs to be run regardless of whether the current instance is the leader.
type LeaderOrNonLeader struct {
manager.Runnable
}

var (
_ manager.LeaderElectionRunnable = &LeaderOrNonLeader{}
_ manager.Runnable = &LeaderOrNonLeader{}
)

func (r *LeaderOrNonLeader) NeedLeaderElection() bool {
return false
}

// EnableAfterBecameLeader is a Runnable that will call the enable function when the current instance becomes
// the leader.
type EnableAfterBecameLeader struct {
enable func(context.Context)
}

var (
_ manager.LeaderElectionRunnable = &EnableAfterBecameLeader{}
_ manager.Runnable = &EnableAfterBecameLeader{}
)

// NewEnableAfterBecameLeader creates a new EnableAfterBecameLeader Runnable.
func NewEnableAfterBecameLeader(enable func(context.Context)) *EnableAfterBecameLeader {
return &EnableAfterBecameLeader{
enable: enable,
}
}

func (j *EnableAfterBecameLeader) Start(ctx context.Context) error {
j.enable(ctx)
return nil
}

func (j *EnableAfterBecameLeader) NeedLeaderElection() bool {
return true
}
38 changes: 38 additions & 0 deletions internal/framework/runnables/runnables_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package runnables

import (
"context"
"testing"

. "github.com/onsi/gomega"
)

func TestLeader(t *testing.T) {
leader := &Leader{}

g := NewWithT(t)
g.Expect(leader.NeedLeaderElection()).To(BeTrue())
}

func TestLeaderOrNonLeader(t *testing.T) {
leaderOrNonLeader := &LeaderOrNonLeader{}

g := NewWithT(t)
g.Expect(leaderOrNonLeader.NeedLeaderElection()).To(BeFalse())
}

func TestEnableAfterBecameLeader(t *testing.T) {
enabled := false
enableAfterBecameLeader := NewEnableAfterBecameLeader(func(_ context.Context) {
enabled = true
})

g := NewWithT(t)
g.Expect(enableAfterBecameLeader.NeedLeaderElection()).To(BeTrue())
g.Expect(enabled).To(BeFalse())

err := enableAfterBecameLeader.Start(context.Background())
g.Expect(err).To(BeNil())

g.Expect(enabled).To(BeTrue())
}
4 changes: 4 additions & 0 deletions internal/mode/static/config/config.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package config

import (
"time"

"github.com/go-logr/logr"
"go.uber.org/zap"
"k8s.io/apimachinery/pkg/types"
Expand Down Expand Up @@ -32,6 +34,8 @@ type Config struct {
MetricsConfig MetricsConfig
// HealthConfig specifies the health probe config.
HealthConfig HealthConfig
// TelemetryReportPeriod is the period at which telemetry reports are sent.
TelemetryReportPeriod time.Duration
}

// GatewayPodConfig contains information about this Pod.
Expand Down
102 changes: 0 additions & 102 deletions internal/mode/static/leader.go

This file was deleted.

56 changes: 33 additions & 23 deletions internal/mode/static/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"k8s.io/client-go/tools/record"
ctlr "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
ctrlcfg "sigs.k8s.io/controller-runtime/pkg/config"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/metrics"
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
Expand All @@ -34,6 +35,8 @@ import (
"github.com/nginxinc/nginx-gateway-fabric/internal/framework/controller/predicate"
"github.com/nginxinc/nginx-gateway-fabric/internal/framework/events"
"github.com/nginxinc/nginx-gateway-fabric/internal/framework/gatewayclass"
"github.com/nginxinc/nginx-gateway-fabric/internal/framework/helpers"
"github.com/nginxinc/nginx-gateway-fabric/internal/framework/runnables"
"github.com/nginxinc/nginx-gateway-fabric/internal/framework/status"
"github.com/nginxinc/nginx-gateway-fabric/internal/mode/static/config"
"github.com/nginxinc/nginx-gateway-fabric/internal/mode/static/metrics/collectors"
Expand All @@ -45,6 +48,7 @@ import (
"github.com/nginxinc/nginx-gateway-fabric/internal/mode/static/state/relationship"
"github.com/nginxinc/nginx-gateway-fabric/internal/mode/static/state/resolver"
"github.com/nginxinc/nginx-gateway-fabric/internal/mode/static/state/validation"
"github.com/nginxinc/nginx-gateway-fabric/internal/mode/static/telemetry"
)

const (
Expand All @@ -69,6 +73,21 @@ func StartManager(cfg config.Config) error {
Scheme: scheme,
Logger: cfg.Logger,
Metrics: getMetricsOptions(cfg.MetricsConfig),
// Note: when the leadership is lost, the manager will return an error in the Start() method.
// However, it will not wait for any Runnable it starts to finish, meaning any in-progress operations
// might get terminated half-way.
LeaderElection: true,
LeaderElectionNamespace: cfg.GatewayPodConfig.Namespace,
LeaderElectionID: cfg.LeaderElection.LockName,
// We're not enabling LeaderElectionReleaseOnCancel because when the Manager stops gracefully, it waits
// for all started Runnables (including Leader-only ones) to finish. Otherwise, the new leader might start
// running Leader-only Runnables before the old leader has finished running them.
// See the doc comment for the LeaderElectionReleaseOnCancel for more details.
LeaderElectionReleaseOnCancel: false,
Controller: ctrlcfg.Controller{
// All of our controllers still need to work in case of non-leader pods
NeedLeaderElection: helpers.GetPointer(false),
},
}

if cfg.HealthConfig.Enabled {
Expand Down Expand Up @@ -211,35 +230,26 @@ func StartManager(cfg config.Config) error {
firstBatchPreparer,
)

if err = mgr.Add(eventLoop); err != nil {
if err = mgr.Add(&runnables.LeaderOrNonLeader{Runnable: eventLoop}); err != nil {
return fmt.Errorf("cannot register event loop: %w", err)
}

leaderElectorLogger := cfg.Logger.WithName("leaderElector")
if err = mgr.Add(runnables.NewEnableAfterBecameLeader(statusUpdater.Enable)); err != nil {
return fmt.Errorf("cannot register status updater: %w", err)
}

if cfg.LeaderElection.Enabled {
leaderElector, err := newLeaderElectorRunnable(leaderElectorRunnableConfig{
kubeConfig: clusterCfg,
recorder: recorder,
onStartedLeading: func(ctx context.Context) {
leaderElectorLogger.Info("Started leading")
statusUpdater.Enable(ctx)
telemetryJob := &runnables.Leader{
Runnable: telemetry.NewJob(
telemetry.JobConfig{
Exporter: telemetry.NewLoggingExporter(cfg.Logger.WithName("telemetryExporter").V(1 /* debug */)),
Logger: cfg.Logger.WithName("telemetryJob"),
Period: cfg.TelemetryReportPeriod,
},
onStoppedLeading: func() {
leaderElectorLogger.Info("Stopped leading")
statusUpdater.Disable()
},
lockNs: cfg.GatewayPodConfig.Namespace,
lockName: cfg.LeaderElection.LockName,
identity: cfg.LeaderElection.Identity,
})
if err != nil {
return err
}
),
}

if err = mgr.Add(leaderElector); err != nil {
return fmt.Errorf("cannot register leader elector: %w", err)
}
if err = mgr.Add(telemetryJob); err != nil {
return fmt.Errorf("cannot register telemetry job: %w", err)
}

cfg.Logger.Info("Starting manager")
Expand Down
4 changes: 4 additions & 0 deletions internal/mode/static/telemetry/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/*
Package telemetry is responsible for collecting and sending product telemetry data.
*/
package telemetry
Loading

0 comments on commit 9d9c1f2

Please sign in to comment.