From b05b4caaf1bbe604a15492db7337011021263a0a Mon Sep 17 00:00:00 2001 From: Guilherme Santos <157053549+gsantos-hc@users.noreply.github.com> Date: Mon, 2 Dec 2024 10:42:36 -0500 Subject: [PATCH] feat: add agent injector telemetry (#703) * feat: add agent injector telemetry Add Prometheus metrics to monitor the Agent Injector's performance. New metrics include a gauge of current requests being processed by the webhook, a histogram of request processing times in milliseconds, and a count of successful and failed injections by Kubernetes namespace. Successful injections are broken down by injection type. The `injection_type` label can assume the value `init_only` for injections with only an initContainer (no sidecar), `sidecar_only` for injections with only a sidecar (no initContainer), and `init_and_sidecar` (the default) otherwise. Fixes AG-005161. * refactor(metrics): add metadata to mutate response Update the `Mutate()` method to return a struct that extends the existing return data (AdmissionResponse) with metadata on the types of Vault Agent injections made. The metadata informs the count of injections by namespace, which are now further broken down by type of injection. 
--- agent-inject/handler.go | 66 ++++++++++++++++++++++------- agent-inject/handler_test.go | 2 +- agent-inject/metrics.go | 77 ++++++++++++++++++++++++++++++++++ subcommand/injector/command.go | 2 + 4 files changed, 132 insertions(+), 15 deletions(-) create mode 100644 agent-inject/metrics.go diff --git a/agent-inject/handler.go b/agent-inject/handler.go index 4eed7047..f30282cb 100644 --- a/agent-inject/handler.go +++ b/agent-inject/handler.go @@ -9,6 +9,7 @@ import ( "io" "net/http" "strings" + "time" "github.com/hashicorp/go-hclog" "github.com/hashicorp/vault-k8s/agent-inject/agent" @@ -85,6 +86,14 @@ type Handler struct { func (h *Handler) Handle(w http.ResponseWriter, r *http.Request) { h.Log.Info("Request received", "Method", r.Method, "URL", r.URL) + // Measure request processing duration and monitor request queue + requestQueue.Inc() + requestStart := time.Now() + defer func() { + requestProcessingTime.Observe(float64(time.Since(requestStart).Milliseconds())) + requestQueue.Dec() + }() + if ct := r.Header.Get("Content-Type"); ct != "application/json" { msg := fmt.Sprintf("Invalid content-type: %q", ct) http.Error(w, msg, http.StatusBadRequest) @@ -109,7 +118,10 @@ func (h *Handler) Handle(w http.ResponseWriter, r *http.Request) { return } - var admResp admissionv1.AdmissionReview + var ( + mutateResp MutateResponse + admResp admissionv1.AdmissionReview + ) // Both v1 and v1beta1 AdmissionReview types are exactly the same, so the v1beta1 type can // be decoded into the v1 type. 
However the runtime codec's decoder guesses which type to @@ -126,7 +138,8 @@ func (h *Handler) Handle(w http.ResponseWriter, r *http.Request) { h.Log.Error("error on request", "Error", msg, "Code", http.StatusInternalServerError) return } else { - admResp.Response = h.Mutate(admReq.Request) + mutateResp = h.Mutate(admReq.Request) + admResp.Response = mutateResp.Resp } // Default to a v1 AdmissionReview, otherwise the API server may not recognize the request @@ -142,26 +155,43 @@ func (h *Handler) Handle(w http.ResponseWriter, r *http.Request) { msg := fmt.Sprintf("error marshalling admission response: %s", err) http.Error(w, msg, http.StatusInternalServerError) h.Log.Error("error on request", "Error", msg, "Code", http.StatusInternalServerError) + incrementInjectionFailures(admReq.Request.Namespace) return } if _, err := w.Write(resp); err != nil { h.Log.Error("error writing response", "Error", err) + incrementInjectionFailures(admReq.Request.Namespace) + return } + + if admResp.Response.Allowed { + incrementInjections(admReq.Request.Namespace, mutateResp) + } else { + incrementInjectionFailures(admReq.Request.Namespace) + } +} + +type MutateResponse struct { + Resp *admissionv1.AdmissionResponse + InjectedInit bool + InjectedSidecar bool } // Mutate takes an admission request and performs mutation if necessary, // returning the final API response. 
-func (h *Handler) Mutate(req *admissionv1.AdmissionRequest) *admissionv1.AdmissionResponse { +func (h *Handler) Mutate(req *admissionv1.AdmissionRequest) MutateResponse { // Decode the pod from the request var pod corev1.Pod if err := json.Unmarshal(req.Object.Raw, &pod); err != nil { h.Log.Error("could not unmarshal request to pod: %s", err) h.Log.Debug("%s", req.Object.Raw) - return &admissionv1.AdmissionResponse{ - UID: req.UID, - Result: &metav1.Status{ - Message: err.Error(), + return MutateResponse{ + Resp: &admissionv1.AdmissionResponse{ + UID: req.UID, + Result: &metav1.Status{ + Message: err.Error(), + }, }, } } @@ -178,7 +208,9 @@ func (h *Handler) Mutate(req *admissionv1.AdmissionRequest) *admissionv1.Admissi err := fmt.Errorf("error checking if should inject agent: %s", err) return admissionError(req.UID, err) } else if !inject { - return resp + return MutateResponse{ + Resp: resp, + } } h.Log.Debug("checking namespaces..") @@ -249,14 +281,20 @@ func (h *Handler) Mutate(req *admissionv1.AdmissionRequest) *admissionv1.Admissi patchType := admissionv1.PatchTypeJSONPatch resp.PatchType = &patchType - return resp + return MutateResponse{ + Resp: resp, + InjectedInit: agentSidecar.PrePopulate, + InjectedSidecar: !agentSidecar.PrePopulateOnly, + } } -func admissionError(UID types.UID, err error) *admissionv1.AdmissionResponse { - return &admissionv1.AdmissionResponse{ - UID: UID, - Result: &metav1.Status{ - Message: err.Error(), +func admissionError(UID types.UID, err error) MutateResponse { + return MutateResponse{ + Resp: &admissionv1.AdmissionResponse{ + UID: UID, + Result: &metav1.Status{ + Message: err.Error(), + }, }, } } diff --git a/agent-inject/handler_test.go b/agent-inject/handler_test.go index 53fb7cc1..b21393f2 100644 --- a/agent-inject/handler_test.go +++ b/agent-inject/handler_test.go @@ -422,7 +422,7 @@ func TestHandlerHandle(t *testing.T) { for _, tt := range cases { t.Run(tt.Name, func(t *testing.T) { req := require.New(t) - resp := 
tt.Handler.Mutate(&tt.Req) + resp := (tt.Handler.Mutate(&tt.Req)).Resp if (tt.Err == "") != resp.Allowed { t.Fatalf("allowed: %v, expected err: %v", resp.Allowed, tt.Err) } diff --git a/agent-inject/metrics.go b/agent-inject/metrics.go new file mode 100644 index 00000000..b2d9c24f --- /dev/null +++ b/agent-inject/metrics.go @@ -0,0 +1,77 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: MPL-2.0 + +package agent_inject + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +const ( + metricsNamespace = "vault" + metricsSubsystem = "agent_injector" + metricsLabelNamespace = "namespace" + metricsLabelType = "injection_type" + metricsLabelTypeBoth = "init_and_sidecar" + metricsLabelTypeInitOnly = "init_only" + metricsLabelTypeSidecarOnly = "sidecar_only" +) + +var ( + requestQueue = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "request_queue_length", + Help: "Count of webhook requests in the injector's queue", + }) + + requestProcessingTime = prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "request_processing_duration_ms", + Help: "Webhook request processing times in milliseconds", + Buckets: []float64{5, 10, 25, 50, 75, 100, 250, 500, 1000, 2500, 5000, 7500, 10000}, + }) + + injectionsByNamespace = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "injections_by_namespace_total", + Help: "Total count of Agent Sidecar injections by namespace", + }, []string{metricsLabelNamespace, metricsLabelType}) + + failedInjectionsByNamespace = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "failed_injections_by_namespace_total", + Help: "Total count of failed Agent Sidecar injections by namespace", + }, []string{metricsLabelNamespace}) +) + +func incrementInjections(namespace 
string, res MutateResponse) { + // Injection type can be one of: init_and_sidecar (default); init_only; or sidecar_only + typeLabel := metricsLabelTypeBoth + if res.InjectedInit && !res.InjectedSidecar { + typeLabel = metricsLabelTypeInitOnly + } else if res.InjectedSidecar && !res.InjectedInit { + typeLabel = metricsLabelTypeSidecarOnly + } + + injectionsByNamespace.With(prometheus.Labels{ + metricsLabelNamespace: namespace, + metricsLabelType: typeLabel, + }).Inc() +} + +func incrementInjectionFailures(namespace string) { + failedInjectionsByNamespace.With(prometheus.Labels{metricsLabelNamespace: namespace}).Inc() +} + +func MustRegisterInjectorMetrics(registry prometheus.Registerer) { + registry.MustRegister( + requestQueue, + requestProcessingTime, + injectionsByNamespace, + failedInjectionsByNamespace, + ) +} diff --git a/subcommand/injector/command.go b/subcommand/injector/command.go index 94e27e8d..75bcd0a5 100644 --- a/subcommand/injector/command.go +++ b/subcommand/injector/command.go @@ -27,6 +27,7 @@ import ( "github.com/hashicorp/vault-k8s/leader" "github.com/hashicorp/vault-k8s/version" "github.com/mitchellh/cli" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" adminv1 "k8s.io/api/admissionregistration/v1" adminv1beta "k8s.io/api/admissionregistration/v1beta1" @@ -231,6 +232,7 @@ func (c *Command) Run(args []string) int { // Registering path to expose metrics if c.flagTelemetryPath != "" { + agentInject.MustRegisterInjectorMetrics(prometheus.DefaultRegisterer) c.UI.Info(fmt.Sprintf("Registering telemetry path on %q", c.flagTelemetryPath)) mux.Handle(c.flagTelemetryPath, promhttp.Handler()) }