From 9bbbcae1a83eef806a75273c01c38b43426a3ac3 Mon Sep 17 00:00:00 2001 From: Thomas Gosteli Date: Fri, 1 Nov 2024 12:23:48 +0000 Subject: [PATCH 1/3] fix(defrag): handle no space left error Signed-off-by: Thomas Gosteli --- mvcc/backend/backend.go | 15 ++++++---- tests/e2e/defrag_no_space_test.go | 49 +++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 5 deletions(-) create mode 100644 tests/e2e/defrag_no_space_test.go diff --git a/mvcc/backend/backend.go b/mvcc/backend/backend.go index 1d5586e9096..bf01d0175d2 100644 --- a/mvcc/backend/backend.go +++ b/mvcc/backend/backend.go @@ -27,8 +27,9 @@ import ( "github.com/coreos/pkg/capnslog" humanize "github.com/dustin/go-humanize" - bolt "go.etcd.io/bbolt" "go.uber.org/zap" + + bolt "go.etcd.io/bbolt" ) var ( @@ -449,10 +450,6 @@ func (b *backend) defrag() error { b.readTx.Lock() defer b.readTx.Unlock() - b.batchTx.unsafeCommit(true) - - b.batchTx.tx = nil - // Create a temporary file to ensure we start with a clean slate. // Snapshotter.cleanupSnapdir cleans up any of these that are found during startup. dir := filepath.Dir(b.db.Path()) @@ -460,11 +457,14 @@ func (b *backend) defrag() error { if err != nil { return err } + options := bolt.Options{} if boltOpenOptions != nil { options = *boltOpenOptions } options.OpenFile = func(path string, i int, mode os.FileMode) (file *os.File, err error) { + // gofail: var defragNoSpace string + // return nil, fmt.Errorf(defragNoSpace) return temp, nil } tdbp := temp.Name() @@ -485,6 +485,11 @@ func (b *backend) defrag() error { zap.String("current-db-size-in-use", humanize.Bytes(uint64(sizeInUse1))), ) } + + // Commit/stop and then reset current transactions (including the readTx) + b.batchTx.unsafeCommit(true) + b.batchTx.tx = nil + // gofail: var defragBeforeCopy struct{} err = defragdb(b.db, tmpdb, defragLimit) if err != nil { diff --git a/tests/e2e/defrag_no_space_test.go b/tests/e2e/defrag_no_space_test.go new file mode 100644 index 00000000000..810136f156e --- /dev/null +++ b/tests/e2e/defrag_no_space_test.go @@ -0,0 +1,49 @@ +// Copyright 2024 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package e2e + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "go.etcd.io/etcd/tests/v3/framework/config" + "go.etcd.io/etcd/tests/v3/framework/e2e" +) + +func TestDefragNoSpace(t *testing.T) { + e2e.BeforeTest(t) + + clus, err := e2e.NewEtcdProcessCluster(context.TODO(), t, + e2e.WithClusterSize(1), + e2e.WithGoFailEnabled(true), + ) + require.NoError(t, err) + t.Cleanup(func() { clus.Stop() }) + + member := clus.Procs[0] + + require.NoError(t, member.Failpoints().SetupHTTP(context.Background(), "defragNoSpace", `return("no space")`)) + require.ErrorContains(t, member.Etcdctl().Defragment(context.Background(), config.DefragOption{Timeout: time.Minute}), "no space") + + // Make sure etcd continues to run even after the failed defrag attempt + require.NoError(t, member.Etcdctl().Put(context.Background(), "foo", "bar", config.PutOptions{})) + value, err := member.Etcdctl().Get(context.Background(), "foo", config.GetOptions{}) + require.NoError(t, err) + require.Len(t, value.Kvs, 1) + require.Equal(t, "bar", string(value.Kvs[0].Value)) +} From f309a23387a49ec34ee15c0dfeedcc5772d76b98 Mon Sep 17 00:00:00 2001 From: Thomas Gosteli Date: Wed, 6 Nov 2024 11:47:18 +0100 Subject: [PATCH 2/3] fix(defrag): handle defragdb failure Signed-off-by: Thomas Gosteli --- mvcc/backend/backend.go | 12 +++++- tests/e2e/defrag_no_space_test.go | 62 +++++++++++++++++++++---------- 2 files changed, 52 insertions(+), 22 deletions(-) diff --git a/mvcc/backend/backend.go b/mvcc/backend/backend.go index bf01d0175d2..9d379d74e07 100644 --- a/mvcc/backend/backend.go +++ b/mvcc/backend/backend.go @@ -463,8 +463,8 @@ func (b *backend) defrag() error { options = *boltOpenOptions } options.OpenFile = func(path string, i int, mode os.FileMode) (file *os.File, err error) { - // gofail: var defragNoSpace string - // return nil, fmt.Errorf(defragNoSpace) + // gofail: var defragOpenFileError string + // return nil, fmt.Errorf(defragOpenFileError) return temp, nil } tdbp := temp.Name() @@ -501,6 +501,11 @@ func (b *backend) defrag() error { plog.Fatalf("failed to remove db.tmp after defragmentation completed: %v", rmErr) } } + + // restore the bbolt transactions if defragmentation fails + b.batchTx.tx = b.unsafeBegin(true) + b.readTx.tx = b.unsafeBegin(false) + return err } @@ -569,6 +574,9 @@ func (b *backend) defrag() error { } func defragdb(odb, tmpdb *bolt.DB, limit int) error { + // gofail: var defragdbFail string + // return fmt.Errorf(defragdbFail) + // open a tx on tmpdb for writes tmptx, err := tmpdb.Begin(true) if err != nil { diff --git a/tests/e2e/defrag_no_space_test.go b/tests/e2e/defrag_no_space_test.go index 810136f156e..f6ceabe667b 100644 --- a/tests/e2e/defrag_no_space_test.go +++ b/tests/e2e/defrag_no_space_test.go @@ -16,6 +16,7 @@ package e2e import ( "context" + "fmt" "testing" "time" @@ -26,24 +27,45 @@ import ( ) func TestDefragNoSpace(t *testing.T) { - e2e.BeforeTest(t) - - clus, err := e2e.NewEtcdProcessCluster(context.TODO(), t, - e2e.WithClusterSize(1), - e2e.WithGoFailEnabled(true), - ) - require.NoError(t, err) - t.Cleanup(func() { clus.Stop() }) - - member := clus.Procs[0] - - require.NoError(t, member.Failpoints().SetupHTTP(context.Background(), "defragNoSpace", `return("no space")`)) - require.ErrorContains(t, member.Etcdctl().Defragment(context.Background(), config.DefragOption{Timeout: time.Minute}), "no space") - - // Make sure etcd continues to run even after the failed defrag attempt - require.NoError(t, member.Etcdctl().Put(context.Background(), "foo", "bar", config.PutOptions{})) - value, err := member.Etcdctl().Get(context.Background(), "foo", config.GetOptions{}) - require.NoError(t, err) - require.Len(t, value.Kvs, 1) - require.Equal(t, "bar", string(value.Kvs[0].Value)) + tests := []struct { + name string + failpoint string + err string + }{ + { + name: "no space (#18810) - can't open/create new bbolt db", + failpoint: "defragOpenFileError", + err: "no space", + }, + { + name: "defragdb failure", + failpoint: "defragdbFail", + err: "some random error", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + e2e.BeforeTest(t) + + clus, err := e2e.NewEtcdProcessCluster(context.TODO(), t, + e2e.WithClusterSize(1), + e2e.WithGoFailEnabled(true), + ) + require.NoError(t, err) + t.Cleanup(func() { clus.Stop() }) + + member := clus.Procs[0] + + require.NoError(t, member.Failpoints().SetupHTTP(context.Background(), tc.failpoint, fmt.Sprintf(`return("%s")`, tc.err))) + require.ErrorContains(t, member.Etcdctl().Defragment(context.Background(), config.DefragOption{Timeout: time.Minute}), tc.err) + + // Make sure etcd continues to run even after the failed defrag attempt + require.NoError(t, member.Etcdctl().Put(context.Background(), "foo", "bar", config.PutOptions{})) + value, err := member.Etcdctl().Get(context.Background(), "foo", config.GetOptions{}) + require.NoError(t, err) + require.Len(t, value.Kvs, 1) + require.Equal(t, "bar", string(value.Kvs[0].Value)) + }) + } } From 77c7c8458958ff03470b705ed8a9ea92773bd48e Mon Sep 17 00:00:00 2001 From: Thomas Gosteli Date: Wed, 6 Nov 2024 14:37:58 +0100 Subject: [PATCH 3/3] chore(e2e): adapt defrag tests for 3.4 Signed-off-by: Thomas Gosteli --- tests/e2e/defrag_no_space_test.go | 23 +++++++++++------------ tests/e2e/etcdctl.go | 10 ++++++++++ 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/tests/e2e/defrag_no_space_test.go b/tests/e2e/defrag_no_space_test.go index f6ceabe667b..0cb75414957 100644 --- a/tests/e2e/defrag_no_space_test.go +++ b/tests/e2e/defrag_no_space_test.go @@ -21,9 +21,6 @@ import ( "time" "github.com/stretchr/testify/require" - - "go.etcd.io/etcd/tests/v3/framework/config" - "go.etcd.io/etcd/tests/v3/framework/e2e" ) func TestDefragNoSpace(t *testing.T) { @@ -46,23 +43,25 @@ func TestDefragNoSpace(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { - e2e.BeforeTest(t) - - clus, err := e2e.NewEtcdProcessCluster(context.TODO(), t, - e2e.WithClusterSize(1), - e2e.WithGoFailEnabled(true), + clus, err := newEtcdProcessCluster(t, + &etcdProcessClusterConfig{ + clusterSize: 1, + debug: true, + goFailEnabled: true, + }, ) require.NoError(t, err) t.Cleanup(func() { clus.Stop() }) - member := clus.Procs[0] + member := clus.procs[0] + etcdctl := member.Etcdctl(clientNonTLS, false, false) require.NoError(t, member.Failpoints().SetupHTTP(context.Background(), tc.failpoint, fmt.Sprintf(`return("%s")`, tc.err))) - require.ErrorContains(t, member.Etcdctl().Defragment(context.Background(), config.DefragOption{Timeout: time.Minute}), tc.err) + require.ErrorContains(t, etcdctl.Defragment(time.Minute), tc.err) // Make sure etcd continues to run even after the failed defrag attempt - require.NoError(t, member.Etcdctl().Put(context.Background(), "foo", "bar", config.PutOptions{})) - value, err := member.Etcdctl().Get(context.Background(), "foo", config.GetOptions{}) + require.NoError(t, etcdctl.Put("foo", "bar")) + value, err := etcdctl.Get("foo") require.NoError(t, err) require.Len(t, value.Kvs, 1) require.Equal(t, "bar", string(value.Kvs[0].Value)) diff --git a/tests/e2e/etcdctl.go b/tests/e2e/etcdctl.go index cc5c7a31b6d..1b2df7b12d4 100644 --- a/tests/e2e/etcdctl.go +++ b/tests/e2e/etcdctl.go @@ -18,6 +18,7 @@ import ( "encoding/json" "fmt" "strings" + "time" "go.etcd.io/etcd/clientv3" ) @@ -141,6 +142,15 @@ func (ctl *Etcdctl) Compact(rev int64) (*clientv3.CompactResponse, error) { return nil, spawnWithExpect(args, fmt.Sprintf("compacted revision %v", rev)) } +func (ctl *Etcdctl) Defragment(timeout time.Duration) error { + args := append(ctl.cmdArgs(), "defrag") + if timeout != 0 { + args = append(args, fmt.Sprintf("--command-timeout=%s", timeout)) + } + + return spawnWithExpect(args, "Finished defragmenting etcd member") +} + func (ctl *Etcdctl) Status() ([]*clientv3.StatusResponse, error) { var epStatus []*struct { Endpoint string