Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
135173: kv: add a backoff to the retry loop in db.Txn r=miraradeva a=miraradeva

In rare cases (e.g. cockroachdb#77376), two transactions can get repeatedly deadlocked while trying to write to same key(s): one aborts the other, but before it can proceed, the other transaction has restarted and acquired a lock on the key again. This can result in the max transaction retries being exceeded without either transaction succeeding.

This commit adds a backoff to the transaction retry loop in `db.Txn`, which will hopefully help one transaction slow down and let the other one commit.

Fixes: cockroachdb#77376

Release note: None

135253: roachtest: get exec traces in gossip/chaos/nodes=9 r=tbg a=tbg

This test is known for strange behavior (cockroachdb#134527).
A recent test failure[^1] shows that we "lose time"
in more unexpected locations.

I'm assuming an execution trace (of the roachtest process) will be
helpful here, as we'll be able to determine whether the time is
spent in roachtest or on the CockroachDB server side.

[^1]: cockroachdb#135154

Epic: none
Release note: None


135333: roachtest: fix "context cancelled" errors in db-console tests r=kyle-a-wong a=kyle-a-wong

changes the context used when writing cypress artifacts to the test artifact directory. This is needed because the existing context is getting cancelled, pressumable when `rtCluster.RunE(ctx...` fails.

Adds `-e NO_COLOR=1` to the docker run command so that the output is more humanreadable in log files

Updates the tests to use `registry.StandardCockroach`. By default, `registry.RandomizedCockroach` is used, and `registry.RuntimeAssertionsCockroach` is built using `cockroach-short`, which does not include db-console in the binary.

Resolves: cockroachdb#134808
Epic: none
Release note: None

Co-authored-by: Mira Radeva <[email protected]>
Co-authored-by: Tobias Grieger <[email protected]>
Co-authored-by: Kyle Wong <[email protected]>
  • Loading branch information
4 people committed Nov 18, 2024
4 parents 5b88207 + ef5deaa + 1afd853 + 3ee09f1 commit 7087319
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 3 deletions.
10 changes: 8 additions & 2 deletions pkg/cmd/roachtest/tests/db_console.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ func (d *dbConsoleCypressTest) RunTest(ctx context.Context, targetNode int, l *l
url := fmt.Sprintf("https://%s", adminUIAddrs[0])
require.NoError(d.t, rtCluster.RunE(ctx, option.WithNodes(workloadNode), "mkdir", "-p", d.artifactPath))
dockerRun := fmt.Sprintf(
`docker run -v %s:/e2e/artifacts %s --config baseUrl=%s,screenshotsFolder=/e2e/artifacts,videosFolder=/e2e/artifacts %s`,
`docker run -e NO_COLOR=1 -v %s:/e2e/artifacts %s --config baseUrl=%s,screenshotsFolder=/e2e/artifacts,videosFolder=/e2e/artifacts %s`,
d.artifactPath, d.imageName, url, specStr)
// If the Docker run fails, get the test failure artifacts and write them to
// roachtest's artifact directory.
Expand All @@ -123,7 +123,7 @@ func (d *dbConsoleCypressTest) RunTest(ctx context.Context, targetNode int, l *l
if mkDirErr := os.MkdirAll(testArtifactsDir, 0777); mkDirErr != nil {
d.t.Fatal(mkDirErr)
}
require.NoError(d.t, rtCluster.Get(ctx, d.t.L(), d.artifactPath, testArtifactsDir, workloadNode))
require.NoError(d.t, rtCluster.Get(context.Background(), d.t.L(), d.artifactPath, testArtifactsDir, workloadNode))
d.t.Fatal(err)
}
}
Expand Down Expand Up @@ -195,11 +195,16 @@ func (d *dbConsoleCypressTest) writeCypressFilesToWorkloadNode(ctx context.Conte
}

func registerDbConsole(r registry.Registry) {
// Explicitly set CockroachBinary to registry.StandardCockroach to ensure that a binary
// containing db console is used. Currently, registry.RuntimeAssertionsCockroach
// is built using cockroach-short and the default of registry.RandomizedCockroach
// causes the tests to be flaky
r.Add(registry.TestSpec{
Name: "db-console/mixed-version-cypress",
Owner: registry.OwnerObservability,
Cluster: r.MakeClusterSpec(5, spec.WorkloadNode()),
CompatibleClouds: registry.AllClouds,
CockroachBinary: registry.StandardCockroach,
Suites: registry.Suites(registry.Nightly),
Randomized: false,
Run: runDbConsoleCypressMixedVersions,
Expand All @@ -210,6 +215,7 @@ func registerDbConsole(r registry.Registry) {
Owner: registry.OwnerObservability,
Cluster: r.MakeClusterSpec(4, spec.WorkloadNode()),
CompatibleClouds: registry.AllClouds,
CockroachBinary: registry.StandardCockroach,
Suites: registry.Suites(registry.Nightly),
Randomized: false,
Run: runDbConsoleCypress,
Expand Down
26 changes: 26 additions & 0 deletions pkg/cmd/roachtest/tests/gossip.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ import (
"net"
"net/url"
"os"
"path/filepath"
"runtime/trace"
"sort"
"strconv"
"strings"
Expand Down Expand Up @@ -54,8 +56,14 @@ SELECT node_id
WHERE expiration > now();
`

tBeforePing := timeutil.Now()
db := c.Conn(ctx, t.L(), node)
defer db.Close()
require.NoError(t, db.Ping())
tAfterPing := timeutil.Now()
if pingDur := tAfterPing.Sub(tBeforePing); pingDur > 20*time.Second {
t.L().Printf("sql connection ready after %.2fs", pingDur.Seconds())
}

rows, err := db.Query(query)
if err != nil {
Expand All @@ -68,7 +76,11 @@ SELECT node_id
require.NotZero(t, nodeID)
nodes = append(nodes, nodeID)
}
require.NoError(t, rows.Err())
sort.Ints(nodes)
if scanDur := timeutil.Since(tAfterPing); scanDur > 20*time.Second {
t.L().Printf("query processed after %.2fs", scanDur.Seconds())
}
return nodes
}

Expand Down Expand Up @@ -145,10 +157,24 @@ SELECT node_id

waitForGossip(0)
nodes := c.All()

for j := 0; j < 10; j++ {
traceFile := filepath.Join(t.ArtifactsDir(), "trace_"+strconv.Itoa(j)+".bin")
f, err := os.Create(traceFile)
require.NoError(t, err)
if err := trace.Start(f); err != nil {
_ = f.Close()
f = nil
_ = os.Remove(traceFile)
}
deadNode := nodes.RandNode()[0]
c.Stop(ctx, t.L(), option.DefaultStopOpts(), c.Node(deadNode))
waitForGossip(deadNode)
if f != nil {
trace.Stop()
_ = f.Close()
t.L().Printf("execution trace: %s", traceFile)
}
c.Start(ctx, t.L(), startOpts, install.MakeClusterSettings(), c.Node(deadNode))
}
}
Expand Down
9 changes: 8 additions & 1 deletion pkg/kv/txn.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"math"
"time"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/isolation"
Expand All @@ -23,6 +24,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/retry"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
Expand Down Expand Up @@ -1045,7 +1047,10 @@ func (e *AutoCommitError) Error() string {
func (txn *Txn) exec(ctx context.Context, fn func(context.Context, *Txn) error) (err error) {
// Run fn in a retry loop until we encounter a success or
// error condition this loop isn't capable of handling.
for attempt := 1; ; attempt++ {
retryOpts := base.DefaultRetryOptions()
retryOpts.InitialBackoff = 20 * time.Millisecond
retryOpts.MaxBackoff = 200 * time.Millisecond
for r := retry.Start(retryOpts); r.Next(); {
if err := ctx.Err(); err != nil {
return errors.Wrap(err, "txn exec")
}
Expand Down Expand Up @@ -1115,6 +1120,8 @@ func (txn *Txn) exec(ctx context.Context, fn func(context.Context, *Txn) error)
// txn.db.ctx.Settings == nil is only expected in tests.
maxRetries = int(MaxInternalTxnAutoRetries.Get(&txn.db.ctx.Settings.SV))
}
// Add 1 because r.CurrentAttempt() starts at 0.
attempt := r.CurrentAttempt() + 1
if attempt > maxRetries {
// If the retries limit has been exceeded, rollback and return an error.
rollbackErr := txn.Rollback(ctx)
Expand Down

0 comments on commit 7087319

Please sign in to comment.