From d06eafd19a932e3afc74804f19a3e82a6269f0d7 Mon Sep 17 00:00:00 2001 From: Stephen Gutekanst Date: Thu, 20 Jun 2024 15:41:50 -0700 Subject: [PATCH] reduce potential lock contention; avoid goroutine leak Hitting the wrench process with `kill -SIGQUIT ` showed in the goroutine stack trace dump that one goroutine was hung on an HTTP request doomed to never finish. Unfortunately, that also meant it held the mutex used to prevent concurrent writes forever - which meant the cache was unable to add new Zig versions - leading to bug reports like https://github.com/mlugg/setup-zig/issues/5#issuecomment-2181355145 indicating fetching a new Zig version was impossible. This fixes it by reducing the lock contention, and using an HTTP client with an actual timeout (Go's defaults suck here, crazy that the default function for 'make an GET request' can hang forever and leak a goroutine, or worse as in this case.) ``` goroutine 25991611 [select, 1612 minutes]: runtime.gopark(0xc00040e9f0?, 0x4?, 0xa8?, 0x83?, 0xc00040e948?) /opt/homebrew/Cellar/go/1.21.6/libexec/src/runtime/proc.go:398 +0xce fp=0xc00040e7e0 sp=0xc00040e7c0 pc=0x43be2e runtime.selectgo(0xc00040e9f0, 0xc00040e940, 0x3?, 0x0, 0x4159fa?, 0x1) /opt/homebrew/Cellar/go/1.21.6/libexec/src/runtime/select.go:327 +0x725 fp=0xc00040e900 sp=0xc00040e7e0 pc=0x44b8e5 net/http.(*http2ClientConn).RoundTrip(0xc000446000, 0xc000662300) /opt/homebrew/Cellar/go/1.21.6/libexec/src/net/http/h2_bundle.go:8381 +0x44f fp=0xc00040ea40 sp=0xc00040e900 pc=0x6cf4cf net/http.(*http2Transport).RoundTripOpt(0xc000128360, 0xc000662300, {0x70?}) /opt/homebrew/Cellar/go/1.21.6/libexec/src/net/http/h2_bundle.go:7649 +0x1b1 fp=0xc00040eb70 sp=0xc00040ea40 pc=0x6cb431 net/http.(*http2Transport).RoundTrip(0x16b3000?, 0xc0001479c0?) /opt/homebrew/Cellar/go/1.21.6/libexec/src/net/http/h2_bundle.go:7598 +0x15 fp=0xc00040eb98 sp=0xc00040eb70 pc=0x6cafb5 net/http.(*Transport).roundTrip(0x16b3000, 0xc000662300) /opt/homebrew/Cellar/go/1.21.6/libexec/src/net/http/transport.go:602 +0x78b fp=0xc00040edc0 sp=0xc00040eb98 pc=0x6fd3eb net/http.(*Transport).RoundTrip(0x0?, 0xf98720?) /opt/homebrew/Cellar/go/1.21.6/libexec/src/net/http/roundtrip.go:17 +0x13 fp=0xc00040ede0 sp=0xc00040edc0 pc=0x6e6253 net/http.send(0xc000662300, {0xf98720, 0x16b3000}, {0x1?, 0xffffffffffffffff?, 0x0?}) /opt/homebrew/Cellar/go/1.21.6/libexec/src/net/http/client.go:260 +0x606 fp=0xc00040f010 sp=0xc00040ede0 pc=0x6a4166 net/http.(*Client).send(0x16d3380, 0xc000662300, {0xc00040f0c0?, 0x5f6bc5?, 0x0?}) /opt/homebrew/Cellar/go/1.21.6/libexec/src/net/http/client.go:181 +0x98 fp=0xc00040f088 sp=0xc00040f010 pc=0x6a39f8 net/http.(*Client).do(0x16d3380, 0xc000662300) /opt/homebrew/Cellar/go/1.21.6/libexec/src/net/http/client.go:724 +0x912 fp=0xc00040f298 sp=0xc00040f088 pc=0x6a5d32 net/http.(*Client).Do(...) /opt/homebrew/Cellar/go/1.21.6/libexec/src/net/http/client.go:590 net/http.(*Client).Get(0xf97a60?, {0xc00032f450?, 0xe5a75c?}) /opt/homebrew/Cellar/go/1.21.6/libexec/src/net/http/client.go:488 +0x5f fp=0xc00040f2f0 sp=0xc00040f298 pc=0x6a52bf net/http.Get(...) /opt/homebrew/Cellar/go/1.21.6/libexec/src/net/http/client.go:457 github.com/hexops/wrench/internal/wrench.(*Bot).httpPkgEnsureZigDownloadCached(0xc0000979a0, {0xc0005d4f5a, 0x19}, {0xe50bab, 0x4}, {0xc0005d4f49, 0x2a}) /Volumes/data/hexops/wrench/internal/wrench/http_pkg.go:417 +0x8a7 fp=0xc00040f7a0 sp=0xc00040f2f0 pc=0xc42467 ``` Signed-off-by: Stephen Gutekanst --- internal/wrench/http_pkg.go | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/internal/wrench/http_pkg.go b/internal/wrench/http_pkg.go index 5c0e0f0..dc20556 100644 --- a/internal/wrench/http_pkg.go +++ b/internal/wrench/http_pkg.go @@ -1,6 +1,7 @@ package wrench import ( + "context" "encoding/json" "fmt" "io" @@ -355,6 +356,7 @@ func (b *Bot) httpPkgEnsureZigVersionCached(version, versionKind string) { var ( cachedResponsesMu sync.Mutex cachedResponses = map[string]error{} + fsParallelismLock sync.Mutex ) func (b *Bot) httpPkgEnsureZigDownloadCached(version, versionKind, fname string) error { @@ -406,15 +408,15 @@ func (b *Bot) httpPkgEnsureZigDownloadCached(version, versionKind, fname string) logWriter := b.idWriter("zig") cachedResponsesMu.Lock() - defer cachedResponsesMu.Unlock() - - if cachedError, ok := cachedResponses[url]; ok { + cachedError, isCachedError := cachedResponses[url] + cachedResponsesMu.Unlock() + if isCachedError { fmt.Fprintf(logWriter, "not fetching: %s (cached error %s)\n", url, cachedError) return cachedError } fmt.Fprintf(logWriter, "fetch: %s > %s\n", url, filePath) - resp, err := http.Get(url) + resp, err := httpGet(url, 60*time.Second) if err != nil { return errors.Wrap(err, "Get") } @@ -423,13 +425,20 @@ func (b *Bot) httpPkgEnsureZigDownloadCached(version, versionKind, fname string) if resp.StatusCode >= 400 && resp.StatusCode <= 500 { // 404 not found, 403 forbidden, etc. err := fmt.Errorf("bad response status: %s", resp.Status) + cachedResponsesMu.Lock() cachedResponses[url] = err + cachedResponsesMu.Unlock() return err } if resp.StatusCode != http.StatusOK { return fmt.Errorf("bad response status: %s", resp.Status) } + // Two goroutines may have fetched this file at the same time (assuming neither hit the cache, + // new file download) - which is fine - but we can't have them write to disk at the same time. + fsParallelismLock.Lock() + defer fsParallelismLock.Unlock() + if err := os.MkdirAll(dirPath, os.ModePerm); err != nil { return errors.Wrap(err, "MkdirAll "+dirPath) } @@ -625,7 +634,7 @@ func (b *Bot) httpPkgZigIndexCached() ([]byte, error) { } // Fetch the latest upstream Zig index.json - resp, err := http.Get("https://ziglang.org/download/index.json") + resp, err := httpGet("https://ziglang.org/download/index.json", 60*time.Second) if err != nil { return nil, errors.Wrap(err, "fetching upstream https://ziglang.org/download/index.json") } @@ -637,7 +646,7 @@ func (b *Bot) httpPkgZigIndexCached() ([]byte, error) { // Fetch the Mach index.json which contains Mach nominated versions, but is otherwise not as // up-to-date as ziglang.org's version. - resp, err = http.Get("https://machengine.org/zig/index.json") + resp, err = httpGet("https://machengine.org/zig/index.json", 60*time.Second) if err != nil { return nil, errors.Wrap(err, "fetching mach https://machengine.org/zig/index.json") } @@ -703,3 +712,14 @@ func (b *Bot) httpPkgZigIndexCached() ([]byte, error) { return httpPkgZigIndexCached, nil } + +// Like http.Get, but actually respects a timeout instead of leaking a goroutine to forever run. +func httpGet(url string, timeout time.Duration) (*http.Response, error) { + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + req, err := http.NewRequestWithContext(ctx, "GET", url, nil) + if err != nil { + return nil, err + } + return http.DefaultClient.Do(req) +}