Skip to content

Commit

Permalink
test(batcher): add failpoint-based test to trigger and test channelTi…
Browse files Browse the repository at this point in the history
…meout logic
  • Loading branch information
samlaf committed Dec 10, 2024
1 parent 0a8bbe4 commit a43ed4d
Show file tree
Hide file tree
Showing 8 changed files with 97 additions and 12 deletions.
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ require (
github.com/multiformats/go-multiaddr-dns v0.4.1
github.com/naoina/toml v0.1.2-0.20170918210437-9fafd6967416
github.com/olekukonko/tablewriter v0.0.5
github.com/pingcap/failpoint v0.0.0-20240528011301-b51a646c7c86
github.com/pkg/errors v0.9.1
github.com/pkg/profile v1.7.0
github.com/prometheus/client_golang v1.20.5
Expand Down Expand Up @@ -184,6 +185,7 @@ require (
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 // indirect
github.com/peterh/liner v1.1.1-0.20190123174540-a2c9a5303de7 // indirect
github.com/pierrec/lz4 v2.6.1+incompatible // indirect
github.com/pingcap/errors v0.11.4 // indirect
github.com/pion/datachannel v1.5.8 // indirect
github.com/pion/dtls/v2 v2.2.12 // indirect
github.com/pion/ice/v2 v2.3.34 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -621,6 +621,8 @@ github.com/pierrec/lz4 v2.6.1+incompatible h1:9UY3+iC23yxF0UfGaYrGplQ+79Rg+h/q9F
github.com/pierrec/lz4 v2.6.1+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY=
github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4=
github.com/pingcap/errors v0.11.4/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8=
github.com/pingcap/failpoint v0.0.0-20240528011301-b51a646c7c86 h1:tdMsjOqUR7YXHoBitzdebTvOjs/swniBTOLy5XiMtuE=
github.com/pingcap/failpoint v0.0.0-20240528011301-b51a646c7c86/go.mod h1:exzhVYca3WRtd6gclGNErRWb1qEgff3LYta0LvRmON4=
github.com/pion/datachannel v1.5.8 h1:ph1P1NsGkazkjrvyMfhRBUAWMxugJjq2HfQifaOoSNo=
github.com/pion/datachannel v1.5.8/go.mod h1:PgmdpoaNBLX9HNzNClmdki4DYW5JtI7Yibu8QzbL3tI=
github.com/pion/dtls/v2 v2.2.7/go.mod h1:8WiMkebSHFD0T+dIU+UeBaoV7kDhOW5oDCzZ7WZ/F9s=
Expand Down
2 changes: 1 addition & 1 deletion just/go.just
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,4 @@ go_fuzz FUZZ TIME='10s' PKG='': (go_test PKG _EXTRALDFLAGS "-fuzztime" TIME "-fu

[private]
go_generate SELECTOR *FLAGS:
go generate -v {{FLAGS}} {{SELECTOR}}
go generate -v {{FLAGS}} {{SELECTOR}}
1 change: 1 addition & 0 deletions mise.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ just = "1.37.0"
"go:github.com/vektra/mockery/v2" = "2.46.0"
"go:github.com/golangci/golangci-lint/cmd/golangci-lint" = "1.61.0"
"go:github.com/mikefarah/yq/v4" = "4.44.3"
"go:github.com/pingcap/failpoint/failpoint-toolexec" = "v0.0.0-20240528011301-b51a646c7c86"

# Python dependencies
"pipx:slither-analyzer" = "0.10.2"
Expand Down
10 changes: 10 additions & 0 deletions op-batcher/batcher/channel.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"github.com/ethereum-optimism/optimism/op-service/eth"
"github.com/ethereum/go-ethereum/core/types"
"github.com/ethereum/go-ethereum/log"
"github.com/pingcap/failpoint"
)

// channel is a lightweight wrapper around a ChannelBuilder which keeps track of pending
Expand Down Expand Up @@ -163,6 +164,12 @@ func (c *channel) Timeout() uint64 {
// A channel has timed out if the difference in L1 Inclusion blocks between
// the first & last included block is greater than or equal to the channel timeout.
func (c *channel) isTimedOut() bool {
// This is used in tests to inject timeouts. It is a noop when not in test mode.
// See op-batcher/justfile's test command for more info.
failpoint.Inject("channel.isTimedOut", func() {
c.log.Warn("channel.isTimedOut failpoint triggered, returning true")
failpoint.Return(true)
})
// Prior to the granite hard fork activating, the use of the shorter ChannelTimeout here may cause the batcher
// to believe the channel timed out when it was valid. It would then resubmit the blocks needlessly.
// This wastes batcher funds but doesn't cause any problems for the chain progressing safe head.
Expand All @@ -182,6 +189,9 @@ func (c *channel) ID() derive.ChannelID {
return c.channelBuilder.ID()
}

// NextAltDACommitment checks if it has already receives the altDA commitment
// of the txData whose first frame is altDAFrameCursor. If it has, it returns
// the txData and true. Otherwise, it returns an empty txData and false.
func (c *channel) NextAltDACommitment() (txData, bool) {
if txData, ok := c.altDACommitments[c.altDAFrameCursor]; ok {
if txData.altDACommitment == nil {
Expand Down
65 changes: 63 additions & 2 deletions op-batcher/batcher/driver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package batcher

import (
"context"
"encoding/binary"
"errors"
"fmt"
"math/big"
Expand All @@ -20,6 +21,7 @@ import (
"github.com/ethereum/go-ethereum/common"
"github.com/ethereum/go-ethereum/core/types"
"github.com/ethereum/go-ethereum/log"
"github.com/pingcap/failpoint"
"github.com/stretchr/testify/require"
)

Expand Down Expand Up @@ -296,8 +298,67 @@ func TestBatchSubmitter_AltDA_FailureCase2_FailedL1Tx(t *testing.T) {
require.Equal(t, uint64(4), fakeTxMgr.Nonce)
}

func TestBatchSubmitter_AltDA_FailureCase3_ChannelTimeout(t *testing.T) {
// TODO: implement this test
// FailpointTest, which can't be run normally, because it requires failpoint to be enabled.
// Run it via `just test-failpoint` from the op-batcher directory.
func TestBatchSubmitter_AltDA_FailureCase3_ChannelTimeout_FailpointTest(t *testing.T) {
t.Parallel()

// // Failpoint injects code in channel.isTimedOut to return true 50% of the time, for up to 4 times.
err := failpoint.Enable("github.com/ethereum-optimism/optimism/op-batcher/batcher/channel.isTimedOut", "50%4*return(true)")
require.NoError(t, err)
t.Cleanup(func() {
_ = failpoint.Disable("github.com/ethereum-optimism/optimism/op-batcher/batcher/channel.isTimedOut")
})

log := testlog.Logger(t, log.LevelInfo)
bs, ep, mockAltDAClient, fakeTxMgr := altDASetup(t, log)
L1Block0 := types.NewBlock(&types.Header{
Number: big.NewInt(0),
}, nil, nil, nil)
L1Block0Ref := eth.L1BlockRef{
Hash: L1Block0.Hash(),
Number: L1Block0.NumberU64(),
}
// We return incremental syncStatuses to force the op-batcher to entirely process each L2 block one by one.
// To test multi channel behavior, we could return a sync status that is multiple blocks ahead of the current L2 block.
// Removing the first 3 mock calls and only always returning a syncStatus on block 4 makes the batcher load all 4 blocks at once,
// and send them in parallel. However, the current channel timeout logic is not robust to this: https://github.com/ethereum-optimism/optimism/issues/13283
// TODO: After that issue is fixed, we can remove the first 3 mock calls here to test with more concurrent channels.
ep.rollupClient.Mock.On("SyncStatus").Times(10).Return(fakeSyncStatus(1, L1Block0Ref), nil)
ep.rollupClient.Mock.On("SyncStatus").Times(10).Return(fakeSyncStatus(2, L1Block0Ref), nil)
ep.rollupClient.Mock.On("SyncStatus").Times(10).Return(fakeSyncStatus(3, L1Block0Ref), nil)
ep.rollupClient.Mock.On("SyncStatus").Return(fakeSyncStatus(4, L1Block0Ref), nil)

L2Block0 := newMiniL2BlockWithNumberParent(1, big.NewInt(0), common.HexToHash("0x0"))
L2Block1 := newMiniL2BlockWithNumberParent(1, big.NewInt(1), L2Block0.Hash())
L2Block2 := newMiniL2BlockWithNumberParent(1, big.NewInt(2), L2Block1.Hash())
L2Block3 := newMiniL2BlockWithNumberParent(1, big.NewInt(3), L2Block2.Hash())
L2Block4 := newMiniL2BlockWithNumberParent(1, big.NewInt(4), L2Block3.Hash())

// L2block0 is the genesis block which is considered safe, so never loaded into the state.
ep.ethClient.Mock.On("BlockByNumber", big.NewInt(1)).Once().Return(L2Block1, nil)
ep.ethClient.Mock.On("BlockByNumber", big.NewInt(2)).Once().Return(L2Block2, nil)
ep.ethClient.Mock.On("BlockByNumber", big.NewInt(3)).Once().Return(L2Block3, nil)
ep.ethClient.Mock.On("BlockByNumber", big.NewInt(4)).Once().Return(L2Block4, nil)

err = bs.StartBatchSubmitting()
require.NoError(t, err)
time.Sleep(1 * time.Second) // 1 second is enough to process all blocks at 10ms poll interval
err = bs.StopBatchSubmitting(context.Background())
require.NoError(t, err)

log.Info("Number of commitments stored by the mockAltDAClient", "StoreCount", mockAltDAClient.StoreCount)
for i, txData := range fakeTxMgr.SuccessfullySentTxData {
// the mockAltDAClient uses counting commitments which take the last 2 bytes (bigEndian uint16),
// and the op-batcher adds the first 2 bytes for the version_byte and commitment_type.
// See https://specs.optimism.io/experimental/alt-da.html#input-commitment-submission
require.Equal(t, 4, len(txData))
// This is a very naive test, because we only check that the requests received by the fakeAltDAClient are sent to L1 in order.
// But this is neither necessary not sufficient. It works right now because the frames are sent in order to the altDAClient (see TODO above).
// But ultimately it is the frames that need to be sent in order, not the random commitment counts generate by the fakeAltDAClient.
// TODO: we should prob take the commitment and retrieve the frames from the fakeAltDAClient, and check that those were received in order somehow.
require.Equal(t, binary.BigEndian.Uint16(txData[2:4]), uint16(i), "altda commitments should be sent to L1 in order.")
}
}

func TestBatchSubmitter_AltDA_FailureCase4_FailedBlobSubmission(t *testing.T) {
Expand Down
8 changes: 7 additions & 1 deletion op-batcher/justfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,13 @@ clean:
rm -f {{BINARY}}

# Run tests
test: (go_test "./...")
test: test-failpoint
just go_test "./..."

# Failpoint tests rely on https://github.com/pingcap/failpoint, which are identified by a _FailpointTest suffix to the test name.
# We run them using a separate GOCACHE because they inject code into the compiled binaries, which could interfere with other tests or normal builds.
test-failpoint:
GOCACHE=/tmp/failpoint-cache just go_test "./batcher/..." "-toolexec" "$(mise which failpoint-toolexec)" "-run" ".*_FailpointTest$"

[private]
batcher_fuzz_task FUZZ TIME='10s': (go_fuzz FUZZ TIME "./batcher")
Expand Down
19 changes: 11 additions & 8 deletions op-service/testutils/fake_txmgr.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,22 @@ import (

// FakeTxMgr is a fake txmgr.TxManager for testing the op-batcher.
type FakeTxMgr struct {
log log.Logger
FromAddr common.Address
Closed bool
Nonce uint64
errorEveryNthSend uint // 0 means never error, 1 means every send errors, etc.
sendCount uint
log log.Logger
FromAddr common.Address
Closed bool
Nonce uint64
SuccessfullySentTxData [][]byte
errorEveryNthSend uint // 0 means never error, 1 means every send errors, etc.
sendCount uint
}

var _ txmgr.TxManager = (*FakeTxMgr)(nil)

func NewFakeTxMgr(log log.Logger, from common.Address) *FakeTxMgr {
return &FakeTxMgr{
log: log,
FromAddr: from,
log: log,
FromAddr: from,
SuccessfullySentTxData: make([][]byte, 0),
}
}

Expand All @@ -54,6 +56,7 @@ func (f *FakeTxMgr) SendAsync(ctx context.Context, candidate txmgr.TxCandidate,
}
sendResponse.Nonce = f.Nonce
f.Nonce++
f.SuccessfullySentTxData = append(f.SuccessfullySentTxData, candidate.TxData)
}
ch <- sendResponse
}
Expand Down

0 comments on commit a43ed4d

Please sign in to comment.