test(batcher): add failpoint-based test to trigger and test channelTi…

…meout logic
Layr-Labs · Dec 10, 2024 · a43ed4d · a43ed4d
1 parent 0a8bbe4
commit a43ed4d
Show file tree

Hide file tree

Showing 8 changed files with 97 additions and 12 deletions.
diff --git a/go.mod b/go.mod
@@ -41,6 +41,7 @@ require (
 	github.com/multiformats/go-multiaddr-dns v0.4.1
 	github.com/naoina/toml v0.1.2-0.20170918210437-9fafd6967416
 	github.com/olekukonko/tablewriter v0.0.5
+	github.com/pingcap/failpoint v0.0.0-20240528011301-b51a646c7c86
 	github.com/pkg/errors v0.9.1
 	github.com/pkg/profile v1.7.0
 	github.com/prometheus/client_golang v1.20.5
@@ -184,6 +185,7 @@ require (
 	github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 // indirect
 	github.com/peterh/liner v1.1.1-0.20190123174540-a2c9a5303de7 // indirect
 	github.com/pierrec/lz4 v2.6.1+incompatible // indirect
+	github.com/pingcap/errors v0.11.4 // indirect
 	github.com/pion/datachannel v1.5.8 // indirect
 	github.com/pion/dtls/v2 v2.2.12 // indirect
 	github.com/pion/ice/v2 v2.3.34 // indirect

diff --git a/go.sum b/go.sum
@@ -621,6 +621,8 @@ github.com/pierrec/lz4 v2.6.1+incompatible h1:9UY3+iC23yxF0UfGaYrGplQ+79Rg+h/q9F
 github.com/pierrec/lz4 v2.6.1+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY=
 github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4=
 github.com/pingcap/errors v0.11.4/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8=
+github.com/pingcap/failpoint v0.0.0-20240528011301-b51a646c7c86 h1:tdMsjOqUR7YXHoBitzdebTvOjs/swniBTOLy5XiMtuE=
+github.com/pingcap/failpoint v0.0.0-20240528011301-b51a646c7c86/go.mod h1:exzhVYca3WRtd6gclGNErRWb1qEgff3LYta0LvRmON4=
 github.com/pion/datachannel v1.5.8 h1:ph1P1NsGkazkjrvyMfhRBUAWMxugJjq2HfQifaOoSNo=
 github.com/pion/datachannel v1.5.8/go.mod h1:PgmdpoaNBLX9HNzNClmdki4DYW5JtI7Yibu8QzbL3tI=
 github.com/pion/dtls/v2 v2.2.7/go.mod h1:8WiMkebSHFD0T+dIU+UeBaoV7kDhOW5oDCzZ7WZ/F9s=

diff --git a/just/go.just b/just/go.just
@@ -28,4 +28,4 @@ go_fuzz FUZZ TIME='10s' PKG='': (go_test PKG _EXTRALDFLAGS "-fuzztime" TIME "-fu
 
 [private]
 go_generate SELECTOR *FLAGS:
-    go generate -v {{FLAGS}} {{SELECTOR}}
+    go generate -v {{FLAGS}} {{SELECTOR}}
diff --git a/mise.toml b/mise.toml
@@ -23,6 +23,7 @@ just = "1.37.0"
 "go:github.com/vektra/mockery/v2" = "2.46.0"
 "go:github.com/golangci/golangci-lint/cmd/golangci-lint" = "1.61.0"
 "go:github.com/mikefarah/yq/v4" = "4.44.3"
+"go:github.com/pingcap/failpoint/failpoint-toolexec" = "v0.0.0-20240528011301-b51a646c7c86"
 
 # Python dependencies
 "pipx:slither-analyzer" = "0.10.2"

diff --git a/op-batcher/batcher/channel.go b/op-batcher/batcher/channel.go
@@ -10,6 +10,7 @@ import (
 	"github.com/ethereum-optimism/optimism/op-service/eth"
 	"github.com/ethereum/go-ethereum/core/types"
 	"github.com/ethereum/go-ethereum/log"
+	"github.com/pingcap/failpoint"
 )
 
 // channel is a lightweight wrapper around a ChannelBuilder which keeps track of pending
@@ -163,6 +164,12 @@ func (c *channel) Timeout() uint64 {
 // A channel has timed out if the difference in L1 Inclusion blocks between
 // the first & last included block is greater than or equal to the channel timeout.
 func (c *channel) isTimedOut() bool {
+	// This is used in tests to inject timeouts. It is a noop when not in test mode.
+	// See op-batcher/justfile's test command for more info.
+	failpoint.Inject("channel.isTimedOut", func() {
+		c.log.Warn("channel.isTimedOut failpoint triggered, returning true")
+		failpoint.Return(true)
+	})
 	// Prior to the granite hard fork activating, the use of the shorter ChannelTimeout here may cause the batcher
 	// to believe the channel timed out when it was valid. It would then resubmit the blocks needlessly.
 	// This wastes batcher funds but doesn't cause any problems for the chain progressing safe head.
@@ -182,6 +189,9 @@ func (c *channel) ID() derive.ChannelID {
 	return c.channelBuilder.ID()
 }
 
+// NextAltDACommitment checks if it has already receives the altDA commitment
+// of the txData whose first frame is altDAFrameCursor. If it has, it returns
+// the txData and true. Otherwise, it returns an empty txData and false.
 func (c *channel) NextAltDACommitment() (txData, bool) {
 	if txData, ok := c.altDACommitments[c.altDAFrameCursor]; ok {
 		if txData.altDACommitment == nil {

diff --git a/op-batcher/batcher/driver_test.go b/op-batcher/batcher/driver_test.go
@@ -2,6 +2,7 @@ package batcher
 
 import (
 	"context"
+	"encoding/binary"
 	"errors"
 	"fmt"
 	"math/big"
@@ -20,6 +21,7 @@ import (
 	"github.com/ethereum/go-ethereum/common"
 	"github.com/ethereum/go-ethereum/core/types"
 	"github.com/ethereum/go-ethereum/log"
+	"github.com/pingcap/failpoint"
 	"github.com/stretchr/testify/require"
 )
 
@@ -296,8 +298,67 @@ func TestBatchSubmitter_AltDA_FailureCase2_FailedL1Tx(t *testing.T) {
 	require.Equal(t, uint64(4), fakeTxMgr.Nonce)
 }
 
-func TestBatchSubmitter_AltDA_FailureCase3_ChannelTimeout(t *testing.T) {
-	// TODO: implement this test
+// FailpointTest, which can't be run normally, because it requires failpoint to be enabled.
+// Run it via `just test-failpoint` from the op-batcher directory.
+func TestBatchSubmitter_AltDA_FailureCase3_ChannelTimeout_FailpointTest(t *testing.T) {
+	t.Parallel()
+
+	// // Failpoint injects code in channel.isTimedOut to return true 50% of the time, for up to 4 times.
+	err := failpoint.Enable("github.com/ethereum-optimism/optimism/op-batcher/batcher/channel.isTimedOut", "50%4*return(true)")
+	require.NoError(t, err)
+	t.Cleanup(func() {
+		_ = failpoint.Disable("github.com/ethereum-optimism/optimism/op-batcher/batcher/channel.isTimedOut")
+	})
+
+	log := testlog.Logger(t, log.LevelInfo)
+	bs, ep, mockAltDAClient, fakeTxMgr := altDASetup(t, log)
+	L1Block0 := types.NewBlock(&types.Header{
+		Number: big.NewInt(0),
+	}, nil, nil, nil)
+	L1Block0Ref := eth.L1BlockRef{
+		Hash:   L1Block0.Hash(),
+		Number: L1Block0.NumberU64(),
+	}
+	// We return incremental syncStatuses to force the op-batcher to entirely process each L2 block one by one.
+	// To test multi channel behavior, we could return a sync status that is multiple blocks ahead of the current L2 block.
+	// Removing the first 3 mock calls and only always returning a syncStatus on block 4 makes the batcher load all 4 blocks at once,
+	// and send them in parallel. However, the current channel timeout logic is not robust to this: https://github.com/ethereum-optimism/optimism/issues/13283
+	// TODO: After that issue is fixed, we can remove the first 3 mock calls here to test with more concurrent channels.
+	ep.rollupClient.Mock.On("SyncStatus").Times(10).Return(fakeSyncStatus(1, L1Block0Ref), nil)
+	ep.rollupClient.Mock.On("SyncStatus").Times(10).Return(fakeSyncStatus(2, L1Block0Ref), nil)
+	ep.rollupClient.Mock.On("SyncStatus").Times(10).Return(fakeSyncStatus(3, L1Block0Ref), nil)
+	ep.rollupClient.Mock.On("SyncStatus").Return(fakeSyncStatus(4, L1Block0Ref), nil)
+
+	L2Block0 := newMiniL2BlockWithNumberParent(1, big.NewInt(0), common.HexToHash("0x0"))
+	L2Block1 := newMiniL2BlockWithNumberParent(1, big.NewInt(1), L2Block0.Hash())
+	L2Block2 := newMiniL2BlockWithNumberParent(1, big.NewInt(2), L2Block1.Hash())
+	L2Block3 := newMiniL2BlockWithNumberParent(1, big.NewInt(3), L2Block2.Hash())
+	L2Block4 := newMiniL2BlockWithNumberParent(1, big.NewInt(4), L2Block3.Hash())
+
+	// L2block0 is the genesis block which is considered safe, so never loaded into the state.
+	ep.ethClient.Mock.On("BlockByNumber", big.NewInt(1)).Once().Return(L2Block1, nil)
+	ep.ethClient.Mock.On("BlockByNumber", big.NewInt(2)).Once().Return(L2Block2, nil)
+	ep.ethClient.Mock.On("BlockByNumber", big.NewInt(3)).Once().Return(L2Block3, nil)
+	ep.ethClient.Mock.On("BlockByNumber", big.NewInt(4)).Once().Return(L2Block4, nil)
+
+	err = bs.StartBatchSubmitting()
+	require.NoError(t, err)
+	time.Sleep(1 * time.Second) // 1 second is enough to process all blocks at 10ms poll interval
+	err = bs.StopBatchSubmitting(context.Background())
+	require.NoError(t, err)
+
+	log.Info("Number of commitments stored by the mockAltDAClient", "StoreCount", mockAltDAClient.StoreCount)
+	for i, txData := range fakeTxMgr.SuccessfullySentTxData {
+		// the mockAltDAClient uses counting commitments which take the last 2 bytes (bigEndian uint16),
+		// and the op-batcher adds the first 2 bytes for the version_byte and commitment_type.
+		// See https://specs.optimism.io/experimental/alt-da.html#input-commitment-submission
+		require.Equal(t, 4, len(txData))
+		// This is a very naive test, because we only check that the requests received by the fakeAltDAClient are sent to L1 in order.
+		// But this is neither necessary not sufficient. It works right now because the frames are sent in order to the altDAClient (see TODO above).
+		// But ultimately it is the frames that need to be sent in order, not the random commitment counts generate by the fakeAltDAClient.
+		// TODO: we should prob take the commitment and retrieve the frames from the fakeAltDAClient, and check that those were received in order somehow.
+		require.Equal(t, binary.BigEndian.Uint16(txData[2:4]), uint16(i), "altda commitments should be sent to L1 in order.")
+	}
 }
 
 func TestBatchSubmitter_AltDA_FailureCase4_FailedBlobSubmission(t *testing.T) {

diff --git a/op-batcher/justfile b/op-batcher/justfile
@@ -17,7 +17,13 @@ clean:
     rm -f {{BINARY}}
 
 # Run tests
-test: (go_test "./...")
+test: test-failpoint
+    just go_test "./..."
+
+# Failpoint tests rely on https://github.com/pingcap/failpoint, which are identified by a _FailpointTest suffix to the test name.
+# We run them using a separate GOCACHE because they inject code into the compiled binaries, which could interfere with other tests or normal builds.
+test-failpoint:
+    GOCACHE=/tmp/failpoint-cache just go_test "./batcher/..." "-toolexec" "$(mise which failpoint-toolexec)" "-run" ".*_FailpointTest$"
 
 [private]
 batcher_fuzz_task FUZZ TIME='10s': (go_fuzz FUZZ TIME "./batcher")

diff --git a/op-service/testutils/fake_txmgr.go b/op-service/testutils/fake_txmgr.go
@@ -14,20 +14,22 @@ import (
 
 // FakeTxMgr is a fake txmgr.TxManager for testing the op-batcher.
 type FakeTxMgr struct {
-	log               log.Logger
-	FromAddr          common.Address
-	Closed            bool
-	Nonce             uint64
-	errorEveryNthSend uint // 0 means never error, 1 means every send errors, etc.
-	sendCount         uint
+	log                    log.Logger
+	FromAddr               common.Address
+	Closed                 bool
+	Nonce                  uint64
+	SuccessfullySentTxData [][]byte
+	errorEveryNthSend      uint // 0 means never error, 1 means every send errors, etc.
+	sendCount              uint
 }
 
 var _ txmgr.TxManager = (*FakeTxMgr)(nil)
 
 func NewFakeTxMgr(log log.Logger, from common.Address) *FakeTxMgr {
 	return &FakeTxMgr{
-		log:      log,
-		FromAddr: from,
+		log:                    log,
+		FromAddr:               from,
+		SuccessfullySentTxData: make([][]byte, 0),
 	}
 }
 
@@ -54,6 +56,7 @@ func (f *FakeTxMgr) SendAsync(ctx context.Context, candidate txmgr.TxCandidate,
 		}
 		sendResponse.Nonce = f.Nonce
 		f.Nonce++
+		f.SuccessfullySentTxData = append(f.SuccessfullySentTxData, candidate.TxData)
 	}
 	ch <- sendResponse
 }