From c01bfa80ef9515d339dc21e428bde796214e457c Mon Sep 17 00:00:00 2001 From: Sam DeHaan Date: Fri, 15 Nov 2024 09:29:48 -0500 Subject: [PATCH] Capture second metrics sample in support bundle to provide metrics delta for investigating issues (#2085) * Capture second metrics sample to provide metrics delta for investigating issues * Update names of metrics samples --- CHANGELOG.md | 4 + docs/sources/troubleshoot/support_bundle.md | 3 +- internal/service/http/supportbundle.go | 109 +++++++++++--------- 3 files changed, 66 insertions(+), 50 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 99a49d0eb2..69682dd19c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,10 @@ Main (unreleased) - Add `otelcol.receiver.solace` component to receive traces from a Solace broker. (@wildum) +### Enhancements + +- Add second metrics sample to the support bundle to provide delta information (@dehaansa) + ### Bugfixes - Fixed an issue in the `prometheus.exporter.postgres` component that would leak goroutines when the target was not reachable (@dehaansa) diff --git a/docs/sources/troubleshoot/support_bundle.md b/docs/sources/troubleshoot/support_bundle.md index 2bb870bc5b..d38c9dd41b 100644 --- a/docs/sources/troubleshoot/support_bundle.md +++ b/docs/sources/troubleshoot/support_bundle.md @@ -38,7 +38,8 @@ A support bundle contains the following data: `/api/v0/web/components` endpoint. * `alloy-logs.txt` contains the logs during the bundle generation. * `alloy-metadata.yaml` contains the {{< param "PRODUCT_NAME" >}} build version and the installation's operating system, architecture, and uptime. -* `alloy-metrics.txt` contains a snapshot of the internal metrics for {{< param "PRODUCT_NAME" >}}. +* `alloy-metrics-sample-start.txt` contains a snapshot of the internal metrics for {{< param "PRODUCT_NAME" >}} at the start of the bundle collection. +* `alloy-metrics-sample-end.txt` contains a snapshot of the internal metrics for {{< param "PRODUCT_NAME" >}} at the end of the bundle collection. * `alloy-peers.json` contains information about the identified cluster peers of this {{< param "PRODUCT_NAME" >}} instance, generated by the `/api/v0/web/peers` endpoint. * `alloy-runtime-flags.txt` contains the values of the runtime flags available in {{< param "PRODUCT_NAME" >}}. diff --git a/internal/service/http/supportbundle.go b/internal/service/http/supportbundle.go index 3c75c35150..ac0898ce5a 100644 --- a/internal/service/http/supportbundle.go +++ b/internal/service/http/supportbundle.go @@ -28,16 +28,17 @@ type SupportBundleContext struct { // Bundle collects all the data that is exposed as a support bundle. type Bundle struct { - meta []byte - alloyMetrics []byte - components []byte - peers []byte - runtimeFlags []byte - heapBuf *bytes.Buffer - goroutineBuf *bytes.Buffer - blockBuf *bytes.Buffer - mutexBuf *bytes.Buffer - cpuBuf *bytes.Buffer + meta []byte + alloyMetricsStart []byte + alloyMetricsEnd []byte + components []byte + peers []byte + runtimeFlags []byte + heapBuf *bytes.Buffer + goroutineBuf *bytes.Buffer + blockBuf *bytes.Buffer + mutexBuf *bytes.Buffer + cpuBuf *bytes.Buffer } // Metadata contains general runtime information about the current Alloy environment. @@ -50,6 +51,26 @@ type Metadata struct { // ExportSupportBundle gathers the information required for the support bundle. func ExportSupportBundle(ctx context.Context, runtimeFlags []string, srvAddress string, dialContext server.DialContextFunc) (*Bundle, error) { + var httpClient http.Client + httpClient.Transport = &http.Transport{DialContext: dialContext} + + // Gather Alloy's own metrics. + alloyMetricsStart, err := retrieveAPIEndpoint(httpClient, srvAddress, "metrics") + if err != nil { + return nil, fmt.Errorf("failed to get internal Alloy metrics: %s", err) + } + + // Gather running component configuration + components, err := retrieveAPIEndpoint(httpClient, srvAddress, "api/v0/web/components") + if err != nil { + return nil, fmt.Errorf("failed to get component details: %s", err) + } + // Gather cluster peers information + peers, err := retrieveAPIEndpoint(httpClient, srvAddress, "api/v0/web/peers") + if err != nil { + return nil, fmt.Errorf("failed to get peer details: %s", err) + } + // The block profiler is disabled by default. Temporarily enable recording // of all blocking events. Also, temporarily record all mutex contentions, // and defer restoring of earlier mutex profiling fraction. @@ -76,24 +97,6 @@ func ExportSupportBundle(ctx context.Context, runtimeFlags []string, srvAddress return nil, fmt.Errorf("failed to marshal support bundle metadata: %s", err) } - var httpClient http.Client - httpClient.Transport = &http.Transport{DialContext: dialContext} - // Gather Alloy's own metrics. - alloyMetrics, err := retrieveAPIEndpoint(httpClient, srvAddress, "metrics") - if err != nil { - return nil, fmt.Errorf("failed to get internal Alloy metrics: %s", err) - } - // Gather running component configuration - components, err := retrieveAPIEndpoint(httpClient, srvAddress, "api/v0/web/components") - if err != nil { - return nil, fmt.Errorf("failed to get component details: %s", err) - } - // Gather cluster peers information - peers, err := retrieveAPIEndpoint(httpClient, srvAddress, "api/v0/web/peers") - if err != nil { - return nil, fmt.Errorf("failed to get peer details: %s", err) - } - // Export pprof data. var ( cpuBuf bytes.Buffer @@ -129,19 +132,26 @@ func ExportSupportBundle(ctx context.Context, runtimeFlags []string, srvAddress return nil, err } + // Gather Alloy's own metrics after the profile completes + alloyMetricsEnd, err := retrieveAPIEndpoint(httpClient, srvAddress, "metrics") + if err != nil { + return nil, fmt.Errorf("failed to get internal Alloy metrics: %s", err) + } + // Finally, bundle everything up to be served, either as a zip from // memory, or exported to a directory. bundle := &Bundle{ - meta: meta, - alloyMetrics: alloyMetrics, - components: components, - peers: peers, - runtimeFlags: []byte(strings.Join(runtimeFlags, "\n")), - heapBuf: &heapBuf, - goroutineBuf: &goroutineBuf, - blockBuf: &blockBuf, - mutexBuf: &mutexBuf, - cpuBuf: &cpuBuf, + meta: meta, + alloyMetricsStart: alloyMetricsStart, + alloyMetricsEnd: alloyMetricsEnd, + components: components, + peers: peers, + runtimeFlags: []byte(strings.Join(runtimeFlags, "\n")), + heapBuf: &heapBuf, + goroutineBuf: &goroutineBuf, + blockBuf: &blockBuf, + mutexBuf: &mutexBuf, + cpuBuf: &cpuBuf, } return bundle, nil @@ -169,17 +179,18 @@ func ServeSupportBundle(rw http.ResponseWriter, b *Bundle, logsBuf *bytes.Buffer rw.Header().Set("Content-Disposition", "attachment; filename=\"alloy-support-bundle.zip\"") zipStructure := map[string][]byte{ - "alloy-metadata.yaml": b.meta, - "alloy-components.json": b.components, - "alloy-peers.json": b.peers, - "alloy-metrics.txt": b.alloyMetrics, - "alloy-runtime-flags.txt": b.runtimeFlags, - "alloy-logs.txt": logsBuf.Bytes(), - "pprof/cpu.pprof": b.cpuBuf.Bytes(), - "pprof/heap.pprof": b.heapBuf.Bytes(), - "pprof/goroutine.pprof": b.goroutineBuf.Bytes(), - "pprof/mutex.pprof": b.mutexBuf.Bytes(), - "pprof/block.pprof": b.blockBuf.Bytes(), + "alloy-metadata.yaml": b.meta, + "alloy-components.json": b.components, + "alloy-peers.json": b.peers, + "alloy-metrics-sample-start.txt": b.alloyMetricsStart, + "alloy-metrics-sample-end.txt": b.alloyMetricsEnd, + "alloy-runtime-flags.txt": b.runtimeFlags, + "alloy-logs.txt": logsBuf.Bytes(), + "pprof/cpu.pprof": b.cpuBuf.Bytes(), + "pprof/heap.pprof": b.heapBuf.Bytes(), + "pprof/goroutine.pprof": b.goroutineBuf.Bytes(), + "pprof/mutex.pprof": b.mutexBuf.Bytes(), + "pprof/block.pprof": b.blockBuf.Bytes(), } for fn, b := range zipStructure {