From 1afd85353ccc5663b164ba12f787a927cbf318a1 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Fri, 15 Nov 2024 12:04:58 +0100 Subject: [PATCH 1/3] roachtest: get exec traces in gossip/chaos/nodes=9 This test is known for strange behavior (#134527). A recent test failure[^1] shows that we "lose time" in more unexpected locations. I'm assuming an execution trace (of the roachtest process) will be helpful here, as we'll be able to determine whether the time is spent in roachtest or on the CockroachDB server side. [^1]: https://github.com/cockroachdb/cockroach/issues/135154 Epic: none Release note: None --- pkg/cmd/roachtest/tests/gossip.go | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pkg/cmd/roachtest/tests/gossip.go b/pkg/cmd/roachtest/tests/gossip.go index c35090930d9f..570ea1c99155 100644 --- a/pkg/cmd/roachtest/tests/gossip.go +++ b/pkg/cmd/roachtest/tests/gossip.go @@ -12,6 +12,8 @@ import ( "net" "net/url" "os" + "path/filepath" + "runtime/trace" "sort" "strconv" "strings" @@ -54,8 +56,14 @@ SELECT node_id WHERE expiration > now(); ` + tBeforePing := timeutil.Now() db := c.Conn(ctx, t.L(), node) defer db.Close() + require.NoError(t, db.Ping()) + tAfterPing := timeutil.Now() + if pingDur := tAfterPing.Sub(tBeforePing); pingDur > 20*time.Second { + t.L().Printf("sql connection ready after %.2fs", pingDur.Seconds()) + } rows, err := db.Query(query) if err != nil { @@ -68,7 +76,11 @@ SELECT node_id require.NotZero(t, nodeID) nodes = append(nodes, nodeID) } + require.NoError(t, rows.Err()) sort.Ints(nodes) + if scanDur := timeutil.Since(tAfterPing); scanDur > 20*time.Second { + t.L().Printf("query processed after %.2fs", scanDur.Seconds()) + } return nodes } @@ -145,10 +157,24 @@ SELECT node_id waitForGossip(0) nodes := c.All() + for j := 0; j < 10; j++ { + traceFile := filepath.Join(t.ArtifactsDir(), "trace_"+strconv.Itoa(j)+".bin") + f, err := os.Create(traceFile) + require.NoError(t, err) + if err := trace.Start(f); err != nil { + _ = f.Close() + f = nil + _ = os.Remove(traceFile) + } deadNode := nodes.RandNode()[0] c.Stop(ctx, t.L(), option.DefaultStopOpts(), c.Node(deadNode)) waitForGossip(deadNode) + if f != nil { + trace.Stop() + _ = f.Close() + t.L().Printf("execution trace: %s", traceFile) + } c.Start(ctx, t.L(), startOpts, install.MakeClusterSettings(), c.Node(deadNode)) } } From ef5deaa7ccabacbc2d2b48838beacbf9ba1aa1c1 Mon Sep 17 00:00:00 2001 From: Mira Radeva Date: Thu, 14 Nov 2024 10:13:55 -0500 Subject: [PATCH 2/3] kv: add a backoff to the retry loop in db.Txn In rare cases (e.g. #77376), two transactions can get repeatedly deadlocked while trying to write to same key(s): one aborts the other, but before it can proceed, the other transaction has restarted and acquired a lock on the key again. This can result in the max transaction retries being exceeded without either transaction succeeding. This commit adds a backoff to the transaction retry loop in `db.Txn`, which will hopefully help one transaction slow down and let the other one commit. Fixes: #77376 Release note: None --- pkg/kv/txn.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pkg/kv/txn.go b/pkg/kv/txn.go index ee0082a3c941..b5ac7245f710 100644 --- a/pkg/kv/txn.go +++ b/pkg/kv/txn.go @@ -11,6 +11,7 @@ import ( "math" "time" + "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/kv/kvpb" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/isolation" @@ -23,6 +24,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/protoutil" + "github.com/cockroachdb/cockroach/pkg/util/retry" "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/cockroach/pkg/util/uuid" @@ -1045,7 +1047,10 @@ func (e *AutoCommitError) Error() string { func (txn *Txn) exec(ctx context.Context, fn func(context.Context, *Txn) error) (err error) { // Run fn in a retry loop until we encounter a success or // error condition this loop isn't capable of handling. - for attempt := 1; ; attempt++ { + retryOpts := base.DefaultRetryOptions() + retryOpts.InitialBackoff = 20 * time.Millisecond + retryOpts.MaxBackoff = 200 * time.Millisecond + for r := retry.Start(retryOpts); r.Next(); { if err := ctx.Err(); err != nil { return errors.Wrap(err, "txn exec") } @@ -1115,6 +1120,8 @@ func (txn *Txn) exec(ctx context.Context, fn func(context.Context, *Txn) error) // txn.db.ctx.Settings == nil is only expected in tests. maxRetries = int(MaxInternalTxnAutoRetries.Get(&txn.db.ctx.Settings.SV)) } + // Add 1 because r.CurrentAttempt() starts at 0. + attempt := r.CurrentAttempt() + 1 if attempt > maxRetries { // If the retries limit has been exceeded, rollback and return an error. rollbackErr := txn.Rollback(ctx) From 3ee09f14f62d1ba760b547e4e4283667f2a0cc14 Mon Sep 17 00:00:00 2001 From: Kyle Wong <37189875+kyle-a-wong@users.noreply.github.com> Date: Fri, 15 Nov 2024 15:54:08 -0500 Subject: [PATCH 3/3] roachtest: fix "context cancelled" errors in db-console tests changes the context used when writing cypress artifacts to the test artifact directory. This is needed because the existing context is getting cancelled, pressumable when `rtCluster.RunE(ctx...` fails. Adds `-e NO_COLOR=1` to the docker run command so that the output is more humanreadable in log files Updates the tests to use `registry.StandardCockroach`. By default, `registry.RandomizedCockroach` is used, and `registry.RuntimeAssertionsCockroach` is built using `cockroach-short`, which does not include db-console in the binary. Epic: none Release note: None --- pkg/cmd/roachtest/tests/db_console.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pkg/cmd/roachtest/tests/db_console.go b/pkg/cmd/roachtest/tests/db_console.go index c69888e04ac9..1e33874d20e3 100644 --- a/pkg/cmd/roachtest/tests/db_console.go +++ b/pkg/cmd/roachtest/tests/db_console.go @@ -114,7 +114,7 @@ func (d *dbConsoleCypressTest) RunTest(ctx context.Context, targetNode int, l *l url := fmt.Sprintf("https://%s", adminUIAddrs[0]) require.NoError(d.t, rtCluster.RunE(ctx, option.WithNodes(workloadNode), "mkdir", "-p", d.artifactPath)) dockerRun := fmt.Sprintf( - `docker run -v %s:/e2e/artifacts %s --config baseUrl=%s,screenshotsFolder=/e2e/artifacts,videosFolder=/e2e/artifacts %s`, + `docker run -e NO_COLOR=1 -v %s:/e2e/artifacts %s --config baseUrl=%s,screenshotsFolder=/e2e/artifacts,videosFolder=/e2e/artifacts %s`, d.artifactPath, d.imageName, url, specStr) // If the Docker run fails, get the test failure artifacts and write them to // roachtest's artifact directory. @@ -123,7 +123,7 @@ func (d *dbConsoleCypressTest) RunTest(ctx context.Context, targetNode int, l *l if mkDirErr := os.MkdirAll(testArtifactsDir, 0777); mkDirErr != nil { d.t.Fatal(mkDirErr) } - require.NoError(d.t, rtCluster.Get(ctx, d.t.L(), d.artifactPath, testArtifactsDir, workloadNode)) + require.NoError(d.t, rtCluster.Get(context.Background(), d.t.L(), d.artifactPath, testArtifactsDir, workloadNode)) d.t.Fatal(err) } } @@ -195,11 +195,16 @@ func (d *dbConsoleCypressTest) writeCypressFilesToWorkloadNode(ctx context.Conte } func registerDbConsole(r registry.Registry) { + // Explicitly set CockroachBinary to registry.StandardCockroach to ensure that a binary + // containing db console is used. Currently, registry.RuntimeAssertionsCockroach + // is built using cockroach-short and the default of registry.RandomizedCockroach + // causes the tests to be flaky r.Add(registry.TestSpec{ Name: "db-console/mixed-version-cypress", Owner: registry.OwnerObservability, Cluster: r.MakeClusterSpec(5, spec.WorkloadNode()), CompatibleClouds: registry.AllClouds, + CockroachBinary: registry.StandardCockroach, Suites: registry.Suites(registry.Nightly), Randomized: false, Run: runDbConsoleCypressMixedVersions, @@ -210,6 +215,7 @@ func registerDbConsole(r registry.Registry) { Owner: registry.OwnerObservability, Cluster: r.MakeClusterSpec(4, spec.WorkloadNode()), CompatibleClouds: registry.AllClouds, + CockroachBinary: registry.StandardCockroach, Suites: registry.Suites(registry.Nightly), Randomized: false, Run: runDbConsoleCypress,