From 747609b0f5d3adff694f3795ff04cefe2b7b3504 Mon Sep 17 00:00:00 2001 From: Takeshi Yoneda Date: Fri, 7 Jun 2024 11:42:45 -0700 Subject: [PATCH] ssa: removes map use for block traversals (#2235) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This removes the use of map in basic block traversals. As a result, overall compilation perf improves like the below: ### Zig ``` goos: darwin goarch: arm64 pkg: github.com/tetratelabs/wazero/internal/integration_test/stdlibs │ old_zig.txt │ new_zig.txt │ │ sec/op │ sec/op vs base │ Zig/Compile/test-opt.wasm-10 4.438 ± 1% 3.778 ± 0% -14.87% (p=0.002 n=6) Zig/Run/test-opt.wasm-10 18.77 ± 1% 18.76 ± 0% ~ (p=0.818 n=6) Zig/Compile/test.wasm-10 5.083 ± 0% 4.673 ± 0% -8.07% (p=0.002 n=6) Zig/Run/test.wasm-10 19.27 ± 1% 19.30 ± 1% ~ (p=0.699 n=6) geomean 9.504 8.941 -5.92% │ old_zig.txt │ new_zig.txt │ │ B/op │ B/op vs base │ Zig/Compile/test-opt.wasm-10 396.7Mi ± 0% 394.7Mi ± 0% -0.51% (p=0.002 n=6) Zig/Run/test-opt.wasm-10 741.7Mi ± 0% 741.7Mi ± 0% ~ (p=0.900 n=6) Zig/Compile/test.wasm-10 660.0Mi ± 0% 659.5Mi ± 0% -0.08% (p=0.002 n=6) Zig/Run/test.wasm-10 1.296Gi ± 0% 1.296Gi ± 0% ~ (p=0.892 n=6) geomean 712.6Mi 711.5Mi -0.15% │ old_zig.txt │ new_zig.txt │ │ allocs/op │ allocs/op vs base │ Zig/Compile/test-opt.wasm-10 363.2k ± 0% 362.6k ± 0% -0.17% (p=0.002 n=6) Zig/Run/test-opt.wasm-10 51.58k ± 0% 51.58k ± 0% ~ (p=0.933 n=6) Zig/Compile/test.wasm-10 515.2k ± 0% 515.4k ± 0% ~ (p=0.485 n=6) Zig/Run/test.wasm-10 2.156M ± 0% 2.156M ± 0% ~ (p=0.998 n=6) geomean 379.8k 379.7k -0.03% ``` ### wasip1 ``` goos: darwin goarch: arm64 pkg: github.com/tetratelabs/wazero/internal/integration_test/stdlibs │ old_wasip1.txt │ new_wasip1.txt │ │ sec/op │ sec/op vs base │ Wasip1/Compile/src_archive_tar.test-10 2.198 ± 1% 2.067 ± 1% -5.96% (p=0.001 n=7) Wasip1/Run/src_archive_tar.test-10 398.8m ± 0% 398.8m ± 0% ~ (p=0.902 n=7) Wasip1/Compile/src_bufio.test-10 1.492 ± 0% 1.409 ± 1% -5.57% (p=0.001 n=7) Wasip1/Run/src_bufio.test-10 120.5m ± 1% 121.0m ± 1% +0.44% (p=0.017 n=7) Wasip1/Compile/src_bytes.test-10 1.543 ± 0% 1.454 ± 0% -5.72% (p=0.001 n=7) Wasip1/Run/src_bytes.test-10 469.0m ± 1% 467.4m ± 1% ~ (p=0.209 n=7) Wasip1/Compile/src_context.test-10 1.664 ± 0% 1.564 ± 1% -6.00% (p=0.001 n=7) Wasip1/Run/src_context.test-10 31.54m ± 1% 31.57m ± 0% ~ (p=0.445 n=6+7) Wasip1/Compile/src_encoding_ascii85.test-10 1.261 ± ∞ ¹ geomean 527.3m 565.9m -2.92% ¹ need >= 6 samples for confidence interval at level 0.95 │ old_wasip1.txt │ new_wasip1.txt │ │ B/op │ B/op vs base │ Wasip1/Compile/src_archive_tar.test-10 93.44Mi ± 0% 93.17Mi ± 0% -0.30% (p=0.001 n=7) Wasip1/Run/src_archive_tar.test-10 286.0Mi ± 0% 286.0Mi ± 0% ~ (p=0.593 n=7) Wasip1/Compile/src_bufio.test-10 74.38Mi ± 0% 74.13Mi ± 0% -0.35% (p=0.001 n=7) Wasip1/Run/src_bufio.test-10 105.3Mi ± 0% 105.3Mi ± 0% ~ (p=0.780 n=7) Wasip1/Compile/src_bytes.test-10 75.58Mi ± 0% 75.32Mi ± 0% -0.35% (p=0.001 n=7) Wasip1/Run/src_bytes.test-10 605.0Mi ± 0% 605.0Mi ± 0% ~ (p=0.331 n=7) Wasip1/Compile/src_context.test-10 78.33Mi ± 0% 78.07Mi ± 0% -0.33% (p=0.001 n=7) Wasip1/Run/src_context.test-10 71.52Mi ± 0% 71.52Mi ± 0% ~ (p=1.000 n=6+7) Wasip1/Compile/src_encoding_ascii85.test-10 70.38Mi ± ∞ ¹ geomean 123.4Mi 115.7Mi -0.17% ¹ need >= 6 samples for confidence interval at level 0.95 │ old_wasip1.txt │ new_wasip1.txt │ │ allocs/op │ allocs/op vs base │ Wasip1/Compile/src_archive_tar.test-10 265.4k ± 0% 265.0k ± 0% -0.16% (p=0.001 n=7) Wasip1/Run/src_archive_tar.test-10 7.831k ± 0% 7.830k ± 0% ~ (p=1.000 n=7) Wasip1/Compile/src_bufio.test-10 195.6k ± 0% 195.4k ± 0% -0.12% (p=0.001 n=7) Wasip1/Run/src_bufio.test-10 3.728k ± 0% 3.728k ± 0% ~ (p=1.000 n=7) ¹ Wasip1/Compile/src_bytes.test-10 204.1k ± 0% 203.7k ± 0% -0.20% (p=0.001 n=7) Wasip1/Run/src_bytes.test-10 6.377k ± 0% 6.377k ± 0% ~ (p=1.000 n=7) Wasip1/Compile/src_context.test-10 221.7k ± 0% 221.6k ± 0% -0.06% (p=0.001 n=7) Wasip1/Run/src_context.test-10 3.814k ± 0% 3.814k ± 1% ~ (p=0.140 n=6+7) Wasip1/Compile/src_encoding_ascii85.test-10 182.3k ± ∞ ² geomean 33.71k 40.64k -0.07% ¹ all samples are equal ² need >= 6 samples for confidence interval at level 0.95 ``` ### TinyGo ``` goos: darwin goarch: arm64 pkg: github.com/tetratelabs/wazero/internal/integration_test/stdlibs │ old_tinygo.txt │ new_tinygo.txt │ │ sec/op │ sec/op vs base │ TinyGo/Compile/container_heap.test-10 410.8m ± 1% 399.8m ± 0% -2.69% (p=0.001 n=7) TinyGo/Run/container_heap.test-10 14.41m ± 0% 14.29m ± 2% -0.77% (p=0.026 n=7) TinyGo/Compile/container_list.test-10 410.5m ± 1% 398.1m ± 0% -3.02% (p=0.001 n=7) TinyGo/Run/container_list.test-10 14.27m ± 2% 14.16m ± 1% ~ (p=0.073 n=7) TinyGo/Compile/container_ring.test-10 403.7m ± 1% 392.5m ± 2% -2.77% (p=0.001 n=7) TinyGo/Run/container_ring.test-10 14.24m ± 0% 14.27m ± 1% ~ (p=0.259 n=7) TinyGo/Compile/crypto_des.test-10 418.8m ± 0% 408.1m ± 0% -2.56% (p=0.001 n=7) TinyGo/Run/crypto_des.test-10 18.23m ± 0% 18.17m ± 1% ~ (p=0.456 n=7) TinyGo/Compile/crypto_md5.test-10 417.3m ± 2% 406.1m ± 1% -2.68% (p=0.001 n=7) TinyGo/Run/crypto_md5.test-10 20.50m ± 0% 20.45m ± 1% ~ (p=0.128 n=7) TinyGo/Compile/crypto_rc4.test-10 402.2m ± 1% 390.5m ± 0% -2.90% (p=0.001 n=7) TinyGo/Run/crypto_rc4.test-10 160.8m ± 0% 161.0m ± 1% ~ (p=1.000 n=7) TinyGo/Compile/crypto_sha1.test-10 417.2m ± 1% 404.5m ± 1% -3.04% (p=0.001 n=7) TinyGo/Run/crypto_sha1.test-10 15.93m ± 1% 15.90m ± 1% ~ (p=0.710 n=7) TinyGo/Compile/crypto_sha256.test-10 423.4m ± 1% 412.4m ± 1% -2.60% (p=0.001 n=7) TinyGo/Run/crypto_sha256.test-10 16.16m ± ∞ ¹ 16.05m ± ∞ ¹ ~ (p=0.381 n=2+5) geomean 94.17m 92.70m -1.56% ¹ need >= 6 samples for confidence interval at level 0.95 │ old_tinygo.txt │ new_tinygo.txt │ │ B/op │ B/op vs base │ TinyGo/Compile/container_heap.test-10 48.55Mi ± 0% 48.30Mi ± 0% -0.52% (p=0.001 n=7) TinyGo/Run/container_heap.test-10 16.63Mi ± 0% 16.63Mi ± 0% ~ (p=0.557 n=7) TinyGo/Compile/container_list.test-10 48.53Mi ± 0% 48.29Mi ± 0% -0.51% (p=0.001 n=7) TinyGo/Run/container_list.test-10 16.40Mi ± 0% 16.40Mi ± 0% ~ (p=0.364 n=7) TinyGo/Compile/container_ring.test-10 47.78Mi ± 0% 47.53Mi ± 0% -0.52% (p=0.001 n=7) TinyGo/Run/container_ring.test-10 16.30Mi ± 0% 16.30Mi ± 0% ~ (p=0.128 n=7) TinyGo/Compile/crypto_des.test-10 48.67Mi ± 0% 48.42Mi ± 0% -0.51% (p=0.001 n=7) TinyGo/Run/crypto_des.test-10 16.76Mi ± 0% 16.76Mi ± 0% ~ (p=0.902 n=7) TinyGo/Compile/crypto_md5.test-10 48.73Mi ± 0% 48.48Mi ± 0% -0.51% (p=0.001 n=7) TinyGo/Run/crypto_md5.test-10 44.97Mi ± 0% 44.97Mi ± 0% ~ (p=0.402 n=7) TinyGo/Compile/crypto_rc4.test-10 47.76Mi ± 0% 47.52Mi ± 0% -0.51% (p=0.001 n=7) TinyGo/Run/crypto_rc4.test-10 29.28Mi ± 0% 29.28Mi ± 0% ~ (p=0.104 n=7) TinyGo/Compile/crypto_sha1.test-10 48.97Mi ± 0% 48.72Mi ± 0% -0.52% (p=0.001 n=7) TinyGo/Run/crypto_sha1.test-10 17.44Mi ± 0% 17.44Mi ± 0% ~ (p=1.000 n=7) TinyGo/Compile/crypto_sha256.test-10 48.81Mi ± 0% 48.56Mi ± 0% -0.51% (p=0.001 n=7) TinyGo/Run/crypto_sha256.test-10 17.53Mi ± ∞ ¹ 17.53Mi ± ∞ ¹ ~ (p=0.381 n=2+5) geomean 31.45Mi 31.37Mi -0.26% ¹ need >= 6 samples for confidence interval at level 0.95 │ old_tinygo.txt │ new_tinygo.txt │ │ allocs/op │ allocs/op vs base │ TinyGo/Compile/container_heap.test-10 83.67k ± 0% 83.46k ± 0% -0.25% (p=0.011 n=7) TinyGo/Run/container_heap.test-10 374.9k ± 0% 374.9k ± 0% ~ (p=1.000 n=7) TinyGo/Compile/container_list.test-10 83.34k ± 0% 83.19k ± 0% -0.19% (p=0.002 n=7) TinyGo/Run/container_list.test-10 370.0k ± 0% 370.0k ± 0% ~ (p=0.674 n=7) TinyGo/Compile/container_ring.test-10 83.26k ± 0% 83.08k ± 0% -0.22% (p=0.004 n=7) TinyGo/Run/container_ring.test-10 367.6k ± 0% 367.6k ± 0% ~ (p=0.249 n=7) TinyGo/Compile/crypto_des.test-10 83.68k ± 0% 83.53k ± 0% -0.18% (p=0.004 n=7) TinyGo/Run/crypto_des.test-10 378.1k ± 0% 378.1k ± 0% ~ (p=0.437 n=7) TinyGo/Compile/crypto_md5.test-10 83.86k ± 0% 83.67k ± 0% -0.23% (p=0.001 n=7) TinyGo/Run/crypto_md5.test-10 393.3k ± 0% 393.3k ± 0% ~ (p=0.592 n=7) TinyGo/Compile/crypto_rc4.test-10 83.32k ± 0% 83.20k ± 0% -0.14% (p=0.011 n=7) TinyGo/Run/crypto_rc4.test-10 367.1k ± 0% 367.1k ± 0% ~ (p=0.102 n=7) TinyGo/Compile/crypto_sha1.test-10 84.05k ± 0% 83.87k ± 0% -0.21% (p=0.002 n=7) TinyGo/Run/crypto_sha1.test-10 392.7k ± 0% 392.7k ± 0% ~ (p=1.000 n=7) TinyGo/Compile/crypto_sha256.test-10 83.86k ± 0% 83.67k ± 0% -0.24% (p=0.001 n=7) TinyGo/Run/crypto_sha256.test-10 394.5k ± ∞ ¹ 394.5k ± ∞ ¹ ~ (p=0.952 n=2+5) geomean 178.2k 178.0k -0.10% ``` ### wazero compiled as wasip1 binary ``` goos: darwin goarch: arm64 pkg: github.com/tetratelabs/wazero │ old.txt │ new.txt │ │ sec/op │ sec/op vs base │ Compilation-10 2.413 ± 0% 2.258 ± 1% -6.42% (p=0.001 n=7) │ old.txt │ new.txt │ │ B/op │ B/op vs base │ Compilation-10 339.9Mi ± 0% 337.7Mi ± 0% -0.63% (p=0.001 n=7) │ old.txt │ new.txt │ │ allocs/op │ allocs/op vs base │ Compilation-10 603.9k ± 0% 602.4k ± 0% -0.25% (p=0.001 n=7) ``` Signed-off-by: Takeshi Yoneda --- internal/engine/wazevo/ssa/basic_block.go | 6 ++++- internal/engine/wazevo/ssa/builder.go | 7 ------ internal/engine/wazevo/ssa/pass.go | 22 ++++-------------- .../engine/wazevo/ssa/pass_blk_layouts.go | 14 +++++------ .../wazevo/ssa/pass_blk_layouts_test.go | 2 +- internal/engine/wazevo/ssa/pass_cfg.go | 23 ++++++++++--------- 6 files changed, 29 insertions(+), 45 deletions(-) diff --git a/internal/engine/wazevo/ssa/basic_block.go b/internal/engine/wazevo/ssa/basic_block.go index 2e7069dca4..39627b9898 100644 --- a/internal/engine/wazevo/ssa/basic_block.go +++ b/internal/engine/wazevo/ssa/basic_block.go @@ -112,7 +112,10 @@ type ( // reversePostOrder is used to sort all the blocks in the function in reverse post order. // This is used in builder.LayoutBlocks. - reversePostOrder int + reversePostOrder int32 + + // visited is used during various traversals. + visited int32 // child and sibling are the ones in the dominator tree. child, sibling *basicBlock @@ -274,6 +277,7 @@ func resetBasicBlock(bb *basicBlock) { bb.unknownValues = bb.unknownValues[:0] bb.lastDefinitions = wazevoapi.ResetMap(bb.lastDefinitions) bb.reversePostOrder = -1 + bb.visited = 0 bb.loopNestingForestChildren = basicBlockVarLengthNil bb.loopHeader = false bb.sibling = nil diff --git a/internal/engine/wazevo/ssa/builder.go b/internal/engine/wazevo/ssa/builder.go index 66d3d9eeff..21a308ba73 100644 --- a/internal/engine/wazevo/ssa/builder.go +++ b/internal/engine/wazevo/ssa/builder.go @@ -143,7 +143,6 @@ func NewBuilder() Builder { varLengthPool: wazevoapi.NewVarLengthPool[Value](), valueAnnotations: make(map[ValueID]string), signatures: make(map[SignatureID]*Signature), - blkVisited: make(map[*basicBlock]int), valueIDAliases: make(map[ValueID]Value), redundantParameterIndexToValue: make(map[int]Value), returnBlk: &basicBlock{id: basicBlockIDReturnBlock}, @@ -189,7 +188,6 @@ type builder struct { // The followings are used for optimization passes/deterministic compilation. instStack []*Instruction - blkVisited map[*basicBlock]int valueIDToInstruction []*Instruction blkStack []*basicBlock blkStack2 []*basicBlock @@ -266,11 +264,6 @@ func (b *builder) Init(s *Signature) { b.blkStack2 = b.blkStack2[:0] b.dominators = b.dominators[:0] b.loopNestingForestRoots = b.loopNestingForestRoots[:0] - - for i := 0; i < b.basicBlocksPool.Allocated(); i++ { - blk := b.basicBlocksPool.View(i) - delete(b.blkVisited, blk) - } b.basicBlocksPool.Reset() for v := ValueID(0); v < b.nextValueID; v++ { diff --git a/internal/engine/wazevo/ssa/pass.go b/internal/engine/wazevo/ssa/pass.go index c7ebb15218..e3ba4075df 100644 --- a/internal/engine/wazevo/ssa/pass.go +++ b/internal/engine/wazevo/ssa/pass.go @@ -78,12 +78,11 @@ func (b *builder) runFinalizingPasses() { // passDeadBlockEliminationOpt searches the unreachable blocks, and sets the basicBlock.invalid flag true if so. func passDeadBlockEliminationOpt(b *builder) { entryBlk := b.entryBlk() - b.clearBlkVisited() b.blkStack = append(b.blkStack, entryBlk) for len(b.blkStack) > 0 { reachableBlk := b.blkStack[len(b.blkStack)-1] b.blkStack = b.blkStack[:len(b.blkStack)-1] - b.blkVisited[reachableBlk] = 0 // the value won't be used in this pass. + reachableBlk.visited = 1 if !reachableBlk.sealed && !reachableBlk.ReturnBlock() { panic(fmt.Sprintf("%s is not sealed", reachableBlk)) @@ -94,7 +93,7 @@ func passDeadBlockEliminationOpt(b *builder) { } for _, succ := range reachableBlk.success { - if _, ok := b.blkVisited[succ]; ok { + if succ.visited == 1 { continue } b.blkStack = append(b.blkStack, succ) @@ -102,9 +101,10 @@ func passDeadBlockEliminationOpt(b *builder) { } for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() { - if _, ok := b.blkVisited[blk]; !ok { + if blk.visited != 1 { blk.invalid = true } + blk.visited = 0 } } @@ -121,7 +121,7 @@ func passRedundantPhiEliminationOpt(b *builder) { // the maximum number of iteration was 22, which seems to be acceptable but not that small either since the // complexity here is O(BlockNum * Iterations) at the worst case where BlockNum might be the order of thousands. // -- Note -- - // Currently, each iteration can run in an order of blocks, but it empirically converges quickly in practice when + // Currently, each iteration can run in any order of blocks, but it empirically converges quickly in practice when // running on the reverse post-order. It might be possible to optimize this further by using the dominator tree. for { changed := false @@ -355,18 +355,6 @@ func (b *builder) incRefCount(id ValueID, from *Instruction) { b.valueRefCounts[id]++ } -// clearBlkVisited clears the b.blkVisited map so that we can reuse it for multiple places. -func (b *builder) clearBlkVisited() { - b.blkStack2 = b.blkStack2[:0] - for key := range b.blkVisited { - b.blkStack2 = append(b.blkStack2, key) - } - for _, blk := range b.blkStack2 { - delete(b.blkVisited, blk) - } - b.blkStack2 = b.blkStack2[:0] -} - // passNopInstElimination eliminates the instructions which is essentially a no-op. func passNopInstElimination(b *builder) { if int(b.nextValueID) >= len(b.valueIDToInstruction) { diff --git a/internal/engine/wazevo/ssa/pass_blk_layouts.go b/internal/engine/wazevo/ssa/pass_blk_layouts.go index bf9063919d..584b5eadea 100644 --- a/internal/engine/wazevo/ssa/pass_blk_layouts.go +++ b/internal/engine/wazevo/ssa/pass_blk_layouts.go @@ -23,8 +23,6 @@ import ( // // This heuristic is done in maybeInvertBranches function. func passLayoutBlocks(b *builder) { - b.clearBlkVisited() - // We might end up splitting critical edges which adds more basic blocks, // so we store the currently existing basic blocks in nonSplitBlocks temporarily. // That way we can iterate over the original basic blocks while appending new ones into reversePostOrderedBasicBlocks. @@ -47,20 +45,20 @@ func passLayoutBlocks(b *builder) { for _, blk := range nonSplitBlocks { for i := range blk.preds { pred := blk.preds[i].blk - if _, ok := b.blkVisited[pred]; ok || !pred.Valid() { + if pred.visited == 1 || !pred.Valid() { continue } else if pred.reversePostOrder < blk.reversePostOrder { // This means the edge is critical, and this pred is the trampoline and yet to be inserted. // Split edge trampolines must come before the destination in reverse post-order. b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, pred) - b.blkVisited[pred] = 0 // mark as inserted, the value is not used. + pred.visited = 1 // mark as inserted. } } // Now that we've already added all the potential trampoline blocks incoming to this block, // we can add this block itself. b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, blk) - b.blkVisited[blk] = 0 // mark as inserted, the value is not used. + blk.visited = 1 // mark as inserted. if len(blk.success) < 2 { // There won't be critical edge originating from this block. @@ -116,7 +114,7 @@ func passLayoutBlocks(b *builder) { if fallthroughBranch.opcode == OpcodeJump && fallthroughBranch.blk == trampoline { // This can be lowered as fallthrough at the end of the block. b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, trampoline) - b.blkVisited[trampoline] = 0 // mark as inserted, the value is not used. + trampoline.visited = 1 // mark as inserted. } else { uninsertedTrampolines = append(uninsertedTrampolines, trampoline) } @@ -126,7 +124,7 @@ func passLayoutBlocks(b *builder) { if trampoline.success[0].reversePostOrder <= trampoline.reversePostOrder { // "<=", not "<" because the target might be itself. // This means the critical edge was backward, so we insert after the current block immediately. b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, trampoline) - b.blkVisited[trampoline] = 0 // mark as inserted, the value is not used. + trampoline.visited = 1 // mark as inserted. } // If the target is forward, we can wait to insert until the target is inserted. } uninsertedTrampolines = uninsertedTrampolines[:0] // Reuse the stack for the next block. @@ -142,7 +140,7 @@ func passLayoutBlocks(b *builder) { if wazevoapi.SSAValidationEnabled { for _, trampoline := range trampolines { - if _, ok := b.blkVisited[trampoline]; !ok { + if trampoline.visited != 1 { panic("BUG: trampoline block not inserted: " + trampoline.formatHeader(b)) } trampoline.validate(b) diff --git a/internal/engine/wazevo/ssa/pass_blk_layouts_test.go b/internal/engine/wazevo/ssa/pass_blk_layouts_test.go index f81bd80d84..cc4e004374 100644 --- a/internal/engine/wazevo/ssa/pass_blk_layouts_test.go +++ b/internal/engine/wazevo/ssa/pass_blk_layouts_test.go @@ -192,7 +192,7 @@ func TestBuilder_splitCriticalEdge(t *testing.T) { predInfo := &basicBlockPredecessorInfo{blk: predBlk, branch: originalBrz} trampoline := b.splitCriticalEdge(predBlk, dummyBlk, predInfo) require.NotNil(t, trampoline) - require.Equal(t, 100, trampoline.reversePostOrder) + require.Equal(t, int32(100), trampoline.reversePostOrder) require.Equal(t, trampoline, predInfo.blk) require.Equal(t, originalBrz, predInfo.branch) diff --git a/internal/engine/wazevo/ssa/pass_cfg.go b/internal/engine/wazevo/ssa/pass_cfg.go index 1bec185ee5..109438ad75 100644 --- a/internal/engine/wazevo/ssa/pass_cfg.go +++ b/internal/engine/wazevo/ssa/pass_cfg.go @@ -15,10 +15,6 @@ import ( // At the last of pass, this function also does the loop detection and sets the basicBlock.loop flag. func passCalculateImmediateDominators(b *builder) { reversePostOrder := b.reversePostOrderedBasicBlocks[:0] - exploreStack := b.blkStack[:0] - b.clearBlkVisited() - - entryBlk := b.entryBlk() // Store the reverse postorder from the entrypoint into reversePostOrder slice. // This calculation of reverse postorder is not described in the paper, @@ -28,14 +24,17 @@ func passCalculateImmediateDominators(b *builder) { // which is a reasonable assumption as long as SSA Builder is properly used. // // First we push blocks in postorder iteratively visit successors of the entry block. - exploreStack = append(exploreStack, entryBlk) + entryBlk := b.entryBlk() + exploreStack := append(b.blkStack[:0], entryBlk) + // These flags are used to track the state of the block in the DFS traversal. + // We temporarily use the reversePostOrder field to store the state. const visitStateUnseen, visitStateSeen, visitStateDone = 0, 1, 2 - b.blkVisited[entryBlk] = visitStateSeen + entryBlk.visited = visitStateSeen for len(exploreStack) > 0 { tail := len(exploreStack) - 1 blk := exploreStack[tail] exploreStack = exploreStack[:tail] - switch b.blkVisited[blk] { + switch blk.visited { case visitStateUnseen: // This is likely a bug in the frontend. panic("BUG: unsupported CFG") @@ -48,16 +47,18 @@ func passCalculateImmediateDominators(b *builder) { if succ.ReturnBlock() || succ.invalid { continue } - if b.blkVisited[succ] == visitStateUnseen { - b.blkVisited[succ] = visitStateSeen + if succ.visited == visitStateUnseen { + succ.visited = visitStateSeen exploreStack = append(exploreStack, succ) } } // Finally, we could pop this block once we pop all of its successors. - b.blkVisited[blk] = visitStateDone + blk.visited = visitStateDone case visitStateDone: // Note: at this point we push blk in postorder despite its name. reversePostOrder = append(reversePostOrder, blk) + default: + panic("BUG") } } // At this point, reversePostOrder has postorder actually, so we reverse it. @@ -67,7 +68,7 @@ func passCalculateImmediateDominators(b *builder) { } for i, blk := range reversePostOrder { - blk.reversePostOrder = i + blk.reversePostOrder = int32(i) } // Reuse the dominators slice if possible from the previous computation of function.