From 0b0258a54ffa3ef3e0e5a36c0d5eb81e63ceb1fb Mon Sep 17 00:00:00 2001
From: Shuo Wu
Date: Tue, 24 Dec 2024 00:37:52 -0800
Subject: [PATCH] feat: introduce snapshot checksum

Longhorn 8666, 9488

Signed-off-by: Shuo Wu
---
 pkg/api/types.go    |  3 +++
 pkg/spdk/replica.go | 44 +++++++++++++++++++++++++++++++++---
 pkg/spdk/types.go   |  3 +++
 pkg/spdk_test.go    | 54 ++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 98 insertions(+), 6 deletions(-)

diff --git a/pkg/api/types.go b/pkg/api/types.go
index 43fa65ac..d78c0ea7 100644
--- a/pkg/api/types.go
+++ b/pkg/api/types.go
@@ -40,6 +40,7 @@ type Lvol struct {
 	CreationTime      string          `json:"creation_time"`
 	UserCreated       bool            `json:"user_created"`
 	SnapshotTimestamp string          `json:"snapshot_timestamp"`
+	SnapshotChecksum  string          `json:"snapshot_checksum"`
 }
 
 func ProtoLvolToLvol(l *spdkrpc.Lvol) *Lvol {
@@ -56,6 +57,7 @@ func ProtoLvolToLvol(l *spdkrpc.Lvol) *Lvol {
 		CreationTime:      l.CreationTime,
 		UserCreated:       l.UserCreated,
 		SnapshotTimestamp: l.SnapshotTimestamp,
+		SnapshotChecksum:  l.SnapshotChecksum,
 	}
 }
 
@@ -73,6 +75,7 @@ func LvolToProtoLvol(l *Lvol) *spdkrpc.Lvol {
 		CreationTime:      l.CreationTime,
 		UserCreated:       l.UserCreated,
 		SnapshotTimestamp: l.SnapshotTimestamp,
+		SnapshotChecksum:  l.SnapshotChecksum,
 	}
 }
 
diff --git a/pkg/spdk/replica.go b/pkg/spdk/replica.go
index c54a5998..a85c6029 100644
--- a/pkg/spdk/replica.go
+++ b/pkg/spdk/replica.go
@@ -67,7 +67,8 @@ type Replica struct {
 	State    types.InstanceState
 	ErrorMsg string
 
-	IsExposed bool
+	IsExposed               bool
+	SnapshotChecksumEnabled bool
 
 	// constructRequired will be set to true when stopping an errored replica
 	constructRequired bool
@@ -190,6 +191,8 @@ func NewReplica(ctx context.Context, replicaName, lvsName, lvsUUID string, specS
 		SpecSize: roundedSpecSize,
 		State:    types.InstanceStatePending,
 
+		SnapshotChecksumEnabled: true,
+
 		rebuildingDstCache: RebuildingDstCache{
 			rebuildingSnapshotMap: map[string]*api.Lvol{},
 			processedSnapshotList: []string{},
@@ -230,6 +233,28 @@ func (r *Replica) Sync(spdkClient *spdkclient.Client) (err error) {
 		return err
 	}
 
+	// Register checksums in the background for snapshot lvols that do not have one recorded yet
+	if r.SnapshotChecksumEnabled {
+		for _, bdevLvol := range bdevLvolMap {
+			if !bdevLvol.DriverSpecific.Lvol.Snapshot {
+				continue
+			}
+			if bdevLvol.DriverSpecific.Lvol.Xattrs[spdkclient.SnapshotChecksum] != "" {
+				continue
+			}
+			// TODO: Use a goroutine pool
+			// Resolve the alias up front so the goroutine does not depend on the loop variable
+			snapLvolAlias := bdevLvol.Aliases[0]
+			go func() {
+				logrus.Debugf("Replica %v is registering checksum for snapshot %v", r.Name, snapLvolAlias)
+				_, err := spdkClient.BdevLvolRegisterSnapshotChecksum(snapLvolAlias)
+				if err != nil {
+					logrus.Errorf("Replica %v failed to register checksum for snapshot %v: %v", r.Name, snapLvolAlias, err)
+				}
+			}()
+		}
+	}
+
 	if r.State == types.InstanceStatePending {
 		return r.construct(bdevLvolMap)
 	}
@@ -442,6 +467,18 @@ func compareSvcLvols(prev, cur *Lvol, checkChildren, checkActualSize bool) error
 		logrus.Warnf("Found mismatching lvol actual size %v with recorded prev lvol actual size %v when validating lvol %s", cur.ActualSize, prev.ActualSize, prev.Name)
 	}
 
+	// An empty checksum on either side means there is nothing to compare yet: align the recorded
+	// prev checksum with cur so that only two different non-empty checksums are reported
+	if prev.SnapshotChecksum == "" {
+		prev.SnapshotChecksum = cur.SnapshotChecksum
+	}
+	if cur.SnapshotChecksum == "" {
+		prev.SnapshotChecksum = ""
+	}
+	if prev.SnapshotChecksum != cur.SnapshotChecksum {
+		return fmt.Errorf("found mismatching lvol snapshot checksum %v with recorded prev lvol snapshot checksum %v when validating lvol %s", cur.SnapshotChecksum, prev.SnapshotChecksum, prev.Name)
+	}
+
 	return nil
 }
 
@@ -1086,6 +1123,7 @@ func (r *Replica) SnapshotDelete(spdkClient *spdkclient.Client, snapshotName str
 			return nil, fmt.Errorf("failed to get the bdev of the only child lvol %s after snapshot %s delete", childSvcLvol.Name, snapshotName)
 		}
 		childSvcLvol.ActualSize = bdevLvol[0].DriverSpecific.Lvol.NumAllocatedClusters * defaultClusterSize
+		childSvcLvol.SnapshotChecksum = ""
 	}
 
 	updateRequired = true
@@ -1968,9 +2006,9 @@ func (r *Replica) rebuildingDstShallowCopyPrepare(spdkClient *spdkclient.Client,
 		dstSnapBdevLvol := bdevLvolMap[dstSnapshotLvolName]
 		snaplvolSnapshotTimestamp := dstSnapBdevLvol.DriverSpecific.Lvol.Xattrs[spdkclient.SnapshotTimestamp]
 		snaplvolActualSize := dstSnapBdevLvol.DriverSpecific.Lvol.NumAllocatedClusters * defaultClusterSize
-		// TODO: Verify the checksum
 		isIntactSnap := srcSnapSvcLvol.SnapshotTimestamp == snaplvolSnapshotTimestamp &&
-			srcSnapSvcLvol.ActualSize == snaplvolActualSize
+			srcSnapSvcLvol.ActualSize == snaplvolActualSize &&
+			srcSnapSvcLvol.SnapshotChecksum != "" && srcSnapSvcLvol.SnapshotChecksum == dstSnapBdevLvol.DriverSpecific.Lvol.Xattrs[spdkclient.SnapshotChecksum]
 		// For now directly delete the corrupted or outdated snapshot lvol and start a full shallow copy since we cannot validate existing data during the shallow copy
 		if !isIntactSnap {
 			for _, childLvolName := range dstSnapBdevLvol.DriverSpecific.Lvol.Clones {
diff --git a/pkg/spdk/types.go b/pkg/spdk/types.go
index 1a4b731e..6436ad45 100644
--- a/pkg/spdk/types.go
+++ b/pkg/spdk/types.go
@@ -76,6 +76,7 @@ type Lvol struct {
 	CreationTime      string
 	UserCreated       bool
 	SnapshotTimestamp string
+	SnapshotChecksum  string
 }
 
 func ServiceBackingImageLvolToProtoBackingImageLvol(lvol *Lvol) *spdkrpc.Lvol {
@@ -117,6 +118,7 @@ func ServiceLvolToProtoLvol(replicaName string, lvol *Lvol) *spdkrpc.Lvol {
 		CreationTime:      lvol.CreationTime,
 		UserCreated:       lvol.UserCreated,
 		SnapshotTimestamp: lvol.SnapshotTimestamp,
+		SnapshotChecksum:  lvol.SnapshotChecksum,
 	}
 
 	if lvol.Name == replicaName {
@@ -150,6 +152,7 @@ func BdevLvolInfoToServiceLvol(bdev *spdktypes.BdevInfo) *Lvol {
 		CreationTime:      bdev.CreationTime,
 		UserCreated:       bdev.DriverSpecific.Lvol.Xattrs[spdkclient.UserCreated] == strconv.FormatBool(true),
 		SnapshotTimestamp: bdev.DriverSpecific.Lvol.Xattrs[spdkclient.SnapshotTimestamp],
+		SnapshotChecksum:  bdev.DriverSpecific.Lvol.Xattrs[spdkclient.SnapshotChecksum],
 	}
 }
 
diff --git a/pkg/spdk_test.go b/pkg/spdk_test.go
index db54253f..9452e533 100644
--- a/pkg/spdk_test.go
+++ b/pkg/spdk_test.go
@@ -59,8 +59,10 @@ var (
 
 	defaultTestExecuteTimeout = 10 * time.Second
 
-	defaultTestRebuildingWaitInterval = 3 * time.Second
-	defaultTestRebuildingWaitCount    = 60
+	defaultTestRebuildingWaitInterval   = 3 * time.Second
+	defaultTestRebuildingWaitCount      = 60
+	defaultTestSnapChecksumWaitInterval = 1 * time.Second
+	defaultTestSnapChecksumWaitCount    = 60
 )
 
 func Test(t *testing.T) { TestingT(t) }
@@ -1382,7 +1384,10 @@ func (s *TestSuite) TestSPDKMultipleThreadFastRebuilding(c *C) {
 		},
 		nil)
 
-	// Test online rebuilding twice
+	waitReplicaSnapshotChecksum(c, spdkCli, replicaName1, "")
+	waitReplicaSnapshotChecksum(c, spdkCli, replicaName2, "")
+
+	// Test online rebuilding
 
 	// Crash replica1
 	err = spdkCli.ReplicaDelete(replicaName1, false)
@@ -1616,6 +1621,49 @@ func checkReplicaSnapshots(c *C, spdkCli *client.SPDKClient, engineName string,
 	}
 }
 
+// waitReplicaSnapshotChecksum waits with the default timeout until the snapshots of the replica have checksums.
+// An empty targetSnapName means all snapshots of the replica.
+func waitReplicaSnapshotChecksum(c *C, spdkCli *client.SPDKClient, replicaName, targetSnapName string) {
+	waitReplicaSnapshotChecksumTimeout(c, spdkCli, replicaName, targetSnapName, defaultTestSnapChecksumWaitCount)
+}
+
+// waitReplicaSnapshotChecksumTimeout polls the replica every defaultTestSnapChecksumWaitInterval until the
+// target snapshot (or every snapshot when targetSnapName is empty) has a non-empty checksum.
+// A target snapshot that is not present on the replica is not waited for.
+// The test fails if the checksums are still missing after timeoutInSecond seconds.
+func waitReplicaSnapshotChecksumTimeout(c *C, spdkCli *client.SPDKClient, replicaName, targetSnapName string, timeoutInSecond int) {
+	ticker := time.NewTicker(defaultTestSnapChecksumWaitInterval)
+	defer ticker.Stop()
+	timer := time.NewTimer(time.Duration(timeoutInSecond) * time.Second)
+	defer timer.Stop()
+
+	// Stays false until a poll confirms the checksums, so that a timeout fails the assertion below
+	hasChecksum := false
+	for !hasChecksum {
+		select {
+		case <-timer.C:
+			c.Assert(hasChecksum, Equals, true)
+			return
+		case <-ticker.C:
+			replica, err := spdkCli.ReplicaGet(replicaName)
+			c.Assert(err, IsNil)
+			hasChecksum = true
+			if targetSnapName == "" || replica.Snapshots[targetSnapName] != nil {
+				for snapName, snap := range replica.Snapshots {
+					if targetSnapName == "" || snapName == targetSnapName {
+						if snap.SnapshotChecksum == "" {
+							hasChecksum = false
+							break
+						}
+					}
+				}
+			}
+		}
+	}
+
+	c.Assert(hasChecksum, Equals, true)
+}
+
 func revertSnapshot(c *C, spdkCli *client.SPDKClient, snapshotName, volumeName, engineName string, replicaAddressMap map[string]string) {
 	ip, err := commonnet.GetAnyExternalIP()
 	c.Assert(err, IsNil)