From 747609b0f5d3adff694f3795ff04cefe2b7b3504 Mon Sep 17 00:00:00 2001
From: Takeshi Yoneda <t.y.mathetake@gmail.com>
Date: Fri, 7 Jun 2024 11:42:45 -0700
Subject: [PATCH] ssa: removes map use for block traversals (#2235)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This removes the use of a map in basic block traversals.
As a result, overall compilation performance improves as shown below:

### Zig
```
goos: darwin
goarch: arm64
pkg: github.com/tetratelabs/wazero/internal/integration_test/stdlibs
                             │ old_zig.txt │            new_zig.txt            │
                             │   sec/op    │   sec/op    vs base               │
Zig/Compile/test-opt.wasm-10    4.438 ± 1%   3.778 ± 0%  -14.87% (p=0.002 n=6)
Zig/Run/test-opt.wasm-10        18.77 ± 1%   18.76 ± 0%        ~ (p=0.818 n=6)
Zig/Compile/test.wasm-10        5.083 ± 0%   4.673 ± 0%   -8.07% (p=0.002 n=6)
Zig/Run/test.wasm-10            19.27 ± 1%   19.30 ± 1%        ~ (p=0.699 n=6)
geomean                         9.504        8.941        -5.92%

                             │ old_zig.txt  │            new_zig.txt             │
                             │     B/op     │     B/op      vs base              │
Zig/Compile/test-opt.wasm-10   396.7Mi ± 0%   394.7Mi ± 0%  -0.51% (p=0.002 n=6)
Zig/Run/test-opt.wasm-10       741.7Mi ± 0%   741.7Mi ± 0%       ~ (p=0.900 n=6)
Zig/Compile/test.wasm-10       660.0Mi ± 0%   659.5Mi ± 0%  -0.08% (p=0.002 n=6)
Zig/Run/test.wasm-10           1.296Gi ± 0%   1.296Gi ± 0%       ~ (p=0.892 n=6)
geomean                        712.6Mi        711.5Mi       -0.15%

                             │ old_zig.txt │            new_zig.txt            │
                             │  allocs/op  │  allocs/op   vs base              │
Zig/Compile/test-opt.wasm-10   363.2k ± 0%   362.6k ± 0%  -0.17% (p=0.002 n=6)
Zig/Run/test-opt.wasm-10       51.58k ± 0%   51.58k ± 0%       ~ (p=0.933 n=6)
Zig/Compile/test.wasm-10       515.2k ± 0%   515.4k ± 0%       ~ (p=0.485 n=6)
Zig/Run/test.wasm-10           2.156M ± 0%   2.156M ± 0%       ~ (p=0.998 n=6)
geomean                        379.8k        379.7k       -0.03%
```

### wasip1

```
goos: darwin
goarch: arm64
pkg: github.com/tetratelabs/wazero/internal/integration_test/stdlibs
                                            │ old_wasip1.txt │            new_wasip1.txt             │
                                            │     sec/op     │    sec/op      vs base                │
Wasip1/Compile/src_archive_tar.test-10            2.198 ± 1%    2.067 ± 1%    -5.96% (p=0.001 n=7)
Wasip1/Run/src_archive_tar.test-10               398.8m ± 0%   398.8m ± 0%         ~ (p=0.902 n=7)
Wasip1/Compile/src_bufio.test-10                  1.492 ± 0%    1.409 ± 1%    -5.57% (p=0.001 n=7)
Wasip1/Run/src_bufio.test-10                     120.5m ± 1%   121.0m ± 1%    +0.44% (p=0.017 n=7)
Wasip1/Compile/src_bytes.test-10                  1.543 ± 0%    1.454 ± 0%    -5.72% (p=0.001 n=7)
Wasip1/Run/src_bytes.test-10                     469.0m ± 1%   467.4m ± 1%         ~ (p=0.209 n=7)
Wasip1/Compile/src_context.test-10                1.664 ± 0%    1.564 ± 1%    -6.00% (p=0.001 n=7)
Wasip1/Run/src_context.test-10                   31.54m ± 1%   31.57m ± 0%         ~ (p=0.445 n=6+7)
Wasip1/Compile/src_encoding_ascii85.test-10                     1.261 ±  ∞ ¹
geomean                                          527.3m        565.9m         -2.92%
¹ need >= 6 samples for confidence interval at level 0.95

                                            │ old_wasip1.txt │             new_wasip1.txt             │
                                            │      B/op      │      B/op       vs base                │
Wasip1/Compile/src_archive_tar.test-10          93.44Mi ± 0%   93.17Mi ± 0%    -0.30% (p=0.001 n=7)
Wasip1/Run/src_archive_tar.test-10              286.0Mi ± 0%   286.0Mi ± 0%         ~ (p=0.593 n=7)
Wasip1/Compile/src_bufio.test-10                74.38Mi ± 0%   74.13Mi ± 0%    -0.35% (p=0.001 n=7)
Wasip1/Run/src_bufio.test-10                    105.3Mi ± 0%   105.3Mi ± 0%         ~ (p=0.780 n=7)
Wasip1/Compile/src_bytes.test-10                75.58Mi ± 0%   75.32Mi ± 0%    -0.35% (p=0.001 n=7)
Wasip1/Run/src_bytes.test-10                    605.0Mi ± 0%   605.0Mi ± 0%         ~ (p=0.331 n=7)
Wasip1/Compile/src_context.test-10              78.33Mi ± 0%   78.07Mi ± 0%    -0.33% (p=0.001 n=7)
Wasip1/Run/src_context.test-10                  71.52Mi ± 0%   71.52Mi ± 0%         ~ (p=1.000 n=6+7)
Wasip1/Compile/src_encoding_ascii85.test-10                    70.38Mi ±  ∞ ¹
geomean                                         123.4Mi        115.7Mi         -0.17%
¹ need >= 6 samples for confidence interval at level 0.95

                                            │ old_wasip1.txt │             new_wasip1.txt              │
                                            │   allocs/op    │   allocs/op    vs base                  │
Wasip1/Compile/src_archive_tar.test-10           265.4k ± 0%   265.0k ± 0%    -0.16% (p=0.001 n=7)
Wasip1/Run/src_archive_tar.test-10               7.831k ± 0%   7.830k ± 0%         ~ (p=1.000 n=7)
Wasip1/Compile/src_bufio.test-10                 195.6k ± 0%   195.4k ± 0%    -0.12% (p=0.001 n=7)
Wasip1/Run/src_bufio.test-10                     3.728k ± 0%   3.728k ± 0%         ~ (p=1.000 n=7)   ¹
Wasip1/Compile/src_bytes.test-10                 204.1k ± 0%   203.7k ± 0%    -0.20% (p=0.001 n=7)
Wasip1/Run/src_bytes.test-10                     6.377k ± 0%   6.377k ± 0%         ~ (p=1.000 n=7)
Wasip1/Compile/src_context.test-10               221.7k ± 0%   221.6k ± 0%    -0.06% (p=0.001 n=7)
Wasip1/Run/src_context.test-10                   3.814k ± 0%   3.814k ± 1%         ~ (p=0.140 n=6+7)
Wasip1/Compile/src_encoding_ascii85.test-10                    182.3k ±  ∞ ²
geomean                                          33.71k        40.64k         -0.07%
¹ all samples are equal
² need >= 6 samples for confidence interval at level 0.95
```


### TinyGo
```
goos: darwin
goarch: arm64
pkg: github.com/tetratelabs/wazero/internal/integration_test/stdlibs
                                      │ old_tinygo.txt │            new_tinygo.txt             │
                                      │     sec/op     │    sec/op      vs base                │
TinyGo/Compile/container_heap.test-10    410.8m ± 1%     399.8m ± 0%    -2.69% (p=0.001 n=7)
TinyGo/Run/container_heap.test-10        14.41m ± 0%     14.29m ± 2%    -0.77% (p=0.026 n=7)
TinyGo/Compile/container_list.test-10    410.5m ± 1%     398.1m ± 0%    -3.02% (p=0.001 n=7)
TinyGo/Run/container_list.test-10        14.27m ± 2%     14.16m ± 1%         ~ (p=0.073 n=7)
TinyGo/Compile/container_ring.test-10    403.7m ± 1%     392.5m ± 2%    -2.77% (p=0.001 n=7)
TinyGo/Run/container_ring.test-10        14.24m ± 0%     14.27m ± 1%         ~ (p=0.259 n=7)
TinyGo/Compile/crypto_des.test-10        418.8m ± 0%     408.1m ± 0%    -2.56% (p=0.001 n=7)
TinyGo/Run/crypto_des.test-10            18.23m ± 0%     18.17m ± 1%         ~ (p=0.456 n=7)
TinyGo/Compile/crypto_md5.test-10        417.3m ± 2%     406.1m ± 1%    -2.68% (p=0.001 n=7)
TinyGo/Run/crypto_md5.test-10            20.50m ± 0%     20.45m ± 1%         ~ (p=0.128 n=7)
TinyGo/Compile/crypto_rc4.test-10        402.2m ± 1%     390.5m ± 0%    -2.90% (p=0.001 n=7)
TinyGo/Run/crypto_rc4.test-10            160.8m ± 0%     161.0m ± 1%         ~ (p=1.000 n=7)
TinyGo/Compile/crypto_sha1.test-10       417.2m ± 1%     404.5m ± 1%    -3.04% (p=0.001 n=7)
TinyGo/Run/crypto_sha1.test-10           15.93m ± 1%     15.90m ± 1%         ~ (p=0.710 n=7)
TinyGo/Compile/crypto_sha256.test-10     423.4m ± 1%     412.4m ± 1%    -2.60% (p=0.001 n=7)
TinyGo/Run/crypto_sha256.test-10         16.16m ±  ∞ ¹   16.05m ±  ∞ ¹       ~ (p=0.381 n=2+5)
geomean                                  94.17m          92.70m         -1.56%
¹ need >= 6 samples for confidence interval at level 0.95

                                      │ old_tinygo.txt │             new_tinygo.txt             │
                                      │      B/op      │      B/op       vs base                │
TinyGo/Compile/container_heap.test-10   48.55Mi ± 0%     48.30Mi ± 0%    -0.52% (p=0.001 n=7)
TinyGo/Run/container_heap.test-10       16.63Mi ± 0%     16.63Mi ± 0%         ~ (p=0.557 n=7)
TinyGo/Compile/container_list.test-10   48.53Mi ± 0%     48.29Mi ± 0%    -0.51% (p=0.001 n=7)
TinyGo/Run/container_list.test-10       16.40Mi ± 0%     16.40Mi ± 0%         ~ (p=0.364 n=7)
TinyGo/Compile/container_ring.test-10   47.78Mi ± 0%     47.53Mi ± 0%    -0.52% (p=0.001 n=7)
TinyGo/Run/container_ring.test-10       16.30Mi ± 0%     16.30Mi ± 0%         ~ (p=0.128 n=7)
TinyGo/Compile/crypto_des.test-10       48.67Mi ± 0%     48.42Mi ± 0%    -0.51% (p=0.001 n=7)
TinyGo/Run/crypto_des.test-10           16.76Mi ± 0%     16.76Mi ± 0%         ~ (p=0.902 n=7)
TinyGo/Compile/crypto_md5.test-10       48.73Mi ± 0%     48.48Mi ± 0%    -0.51% (p=0.001 n=7)
TinyGo/Run/crypto_md5.test-10           44.97Mi ± 0%     44.97Mi ± 0%         ~ (p=0.402 n=7)
TinyGo/Compile/crypto_rc4.test-10       47.76Mi ± 0%     47.52Mi ± 0%    -0.51% (p=0.001 n=7)
TinyGo/Run/crypto_rc4.test-10           29.28Mi ± 0%     29.28Mi ± 0%         ~ (p=0.104 n=7)
TinyGo/Compile/crypto_sha1.test-10      48.97Mi ± 0%     48.72Mi ± 0%    -0.52% (p=0.001 n=7)
TinyGo/Run/crypto_sha1.test-10          17.44Mi ± 0%     17.44Mi ± 0%         ~ (p=1.000 n=7)
TinyGo/Compile/crypto_sha256.test-10    48.81Mi ± 0%     48.56Mi ± 0%    -0.51% (p=0.001 n=7)
TinyGo/Run/crypto_sha256.test-10        17.53Mi ±  ∞ ¹   17.53Mi ±  ∞ ¹       ~ (p=0.381 n=2+5)
geomean                                 31.45Mi          31.37Mi         -0.26%
¹ need >= 6 samples for confidence interval at level 0.95

                                      │ old_tinygo.txt │            new_tinygo.txt             │
                                      │   allocs/op    │   allocs/op    vs base                │
TinyGo/Compile/container_heap.test-10    83.67k ± 0%     83.46k ± 0%    -0.25% (p=0.011 n=7)
TinyGo/Run/container_heap.test-10        374.9k ± 0%     374.9k ± 0%         ~ (p=1.000 n=7)
TinyGo/Compile/container_list.test-10    83.34k ± 0%     83.19k ± 0%    -0.19% (p=0.002 n=7)
TinyGo/Run/container_list.test-10        370.0k ± 0%     370.0k ± 0%         ~ (p=0.674 n=7)
TinyGo/Compile/container_ring.test-10    83.26k ± 0%     83.08k ± 0%    -0.22% (p=0.004 n=7)
TinyGo/Run/container_ring.test-10        367.6k ± 0%     367.6k ± 0%         ~ (p=0.249 n=7)
TinyGo/Compile/crypto_des.test-10        83.68k ± 0%     83.53k ± 0%    -0.18% (p=0.004 n=7)
TinyGo/Run/crypto_des.test-10            378.1k ± 0%     378.1k ± 0%         ~ (p=0.437 n=7)
TinyGo/Compile/crypto_md5.test-10        83.86k ± 0%     83.67k ± 0%    -0.23% (p=0.001 n=7)
TinyGo/Run/crypto_md5.test-10            393.3k ± 0%     393.3k ± 0%         ~ (p=0.592 n=7)
TinyGo/Compile/crypto_rc4.test-10        83.32k ± 0%     83.20k ± 0%    -0.14% (p=0.011 n=7)
TinyGo/Run/crypto_rc4.test-10            367.1k ± 0%     367.1k ± 0%         ~ (p=0.102 n=7)
TinyGo/Compile/crypto_sha1.test-10       84.05k ± 0%     83.87k ± 0%    -0.21% (p=0.002 n=7)
TinyGo/Run/crypto_sha1.test-10           392.7k ± 0%     392.7k ± 0%         ~ (p=1.000 n=7)
TinyGo/Compile/crypto_sha256.test-10     83.86k ± 0%     83.67k ± 0%    -0.24% (p=0.001 n=7)
TinyGo/Run/crypto_sha256.test-10         394.5k ±  ∞ ¹   394.5k ±  ∞ ¹       ~ (p=0.952 n=2+5)
geomean                                  178.2k          178.0k         -0.10%
```

### wazero compiled as wasip1 binary

```
goos: darwin
goarch: arm64
pkg: github.com/tetratelabs/wazero
               │  old.txt   │             new.txt              │
               │   sec/op   │   sec/op    vs base              │
Compilation-10   2.413 ± 0%   2.258 ± 1%  -6.42% (p=0.001 n=7)

               │   old.txt    │              new.txt               │
               │     B/op     │     B/op      vs base              │
Compilation-10   339.9Mi ± 0%   337.7Mi ± 0%  -0.63% (p=0.001 n=7)

               │   old.txt   │              new.txt              │
               │  allocs/op  │  allocs/op   vs base              │
Compilation-10   603.9k ± 0%   602.4k ± 0%  -0.25% (p=0.001 n=7)
```


Signed-off-by: Takeshi Yoneda <t.y.mathetake@gmail.com>
---
 internal/engine/wazevo/ssa/basic_block.go     |  6 ++++-
 internal/engine/wazevo/ssa/builder.go         |  7 ------
 internal/engine/wazevo/ssa/pass.go            | 22 ++++--------------
 .../engine/wazevo/ssa/pass_blk_layouts.go     | 14 +++++------
 .../wazevo/ssa/pass_blk_layouts_test.go       |  2 +-
 internal/engine/wazevo/ssa/pass_cfg.go        | 23 ++++++++++---------
 6 files changed, 29 insertions(+), 45 deletions(-)

diff --git a/internal/engine/wazevo/ssa/basic_block.go b/internal/engine/wazevo/ssa/basic_block.go
index 2e7069dca4..39627b9898 100644
--- a/internal/engine/wazevo/ssa/basic_block.go
+++ b/internal/engine/wazevo/ssa/basic_block.go
@@ -112,7 +112,10 @@ type (
 
 		// reversePostOrder is used to sort all the blocks in the function in reverse post order.
 		// This is used in builder.LayoutBlocks.
-		reversePostOrder int
+		reversePostOrder int32
+
+		// visited is a scratch marker used during various traversals; each pass defines its own meaning for the value.
+		visited int32
 
 		// child and sibling are the ones in the dominator tree.
 		child, sibling *basicBlock
@@ -274,6 +277,7 @@ func resetBasicBlock(bb *basicBlock) {
 	bb.unknownValues = bb.unknownValues[:0]
 	bb.lastDefinitions = wazevoapi.ResetMap(bb.lastDefinitions)
 	bb.reversePostOrder = -1
+	bb.visited = 0
 	bb.loopNestingForestChildren = basicBlockVarLengthNil
 	bb.loopHeader = false
 	bb.sibling = nil
diff --git a/internal/engine/wazevo/ssa/builder.go b/internal/engine/wazevo/ssa/builder.go
index 66d3d9eeff..21a308ba73 100644
--- a/internal/engine/wazevo/ssa/builder.go
+++ b/internal/engine/wazevo/ssa/builder.go
@@ -143,7 +143,6 @@ func NewBuilder() Builder {
 		varLengthPool:                  wazevoapi.NewVarLengthPool[Value](),
 		valueAnnotations:               make(map[ValueID]string),
 		signatures:                     make(map[SignatureID]*Signature),
-		blkVisited:                     make(map[*basicBlock]int),
 		valueIDAliases:                 make(map[ValueID]Value),
 		redundantParameterIndexToValue: make(map[int]Value),
 		returnBlk:                      &basicBlock{id: basicBlockIDReturnBlock},
@@ -189,7 +188,6 @@ type builder struct {
 
 	// The followings are used for optimization passes/deterministic compilation.
 	instStack                      []*Instruction
-	blkVisited                     map[*basicBlock]int
 	valueIDToInstruction           []*Instruction
 	blkStack                       []*basicBlock
 	blkStack2                      []*basicBlock
@@ -266,11 +264,6 @@ func (b *builder) Init(s *Signature) {
 	b.blkStack2 = b.blkStack2[:0]
 	b.dominators = b.dominators[:0]
 	b.loopNestingForestRoots = b.loopNestingForestRoots[:0]
-
-	for i := 0; i < b.basicBlocksPool.Allocated(); i++ {
-		blk := b.basicBlocksPool.View(i)
-		delete(b.blkVisited, blk)
-	}
 	b.basicBlocksPool.Reset()
 
 	for v := ValueID(0); v < b.nextValueID; v++ {
diff --git a/internal/engine/wazevo/ssa/pass.go b/internal/engine/wazevo/ssa/pass.go
index c7ebb15218..e3ba4075df 100644
--- a/internal/engine/wazevo/ssa/pass.go
+++ b/internal/engine/wazevo/ssa/pass.go
@@ -78,12 +78,11 @@ func (b *builder) runFinalizingPasses() {
 // passDeadBlockEliminationOpt searches the unreachable blocks, and sets the basicBlock.invalid flag true if so.
 func passDeadBlockEliminationOpt(b *builder) {
 	entryBlk := b.entryBlk()
-	b.clearBlkVisited()
 	b.blkStack = append(b.blkStack, entryBlk)
 	for len(b.blkStack) > 0 {
 		reachableBlk := b.blkStack[len(b.blkStack)-1]
 		b.blkStack = b.blkStack[:len(b.blkStack)-1]
-		b.blkVisited[reachableBlk] = 0 // the value won't be used in this pass.
+		reachableBlk.visited = 1
 
 		if !reachableBlk.sealed && !reachableBlk.ReturnBlock() {
 			panic(fmt.Sprintf("%s is not sealed", reachableBlk))
@@ -94,7 +93,7 @@ func passDeadBlockEliminationOpt(b *builder) {
 		}
 
 		for _, succ := range reachableBlk.success {
-			if _, ok := b.blkVisited[succ]; ok {
+			if succ.visited == 1 {
 				continue
 			}
 			b.blkStack = append(b.blkStack, succ)
@@ -102,9 +101,10 @@ func passDeadBlockEliminationOpt(b *builder) {
 	}
 
 	for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() {
-		if _, ok := b.blkVisited[blk]; !ok {
+		if blk.visited != 1 {
 			blk.invalid = true
 		}
+		blk.visited = 0
 	}
 }
 
@@ -121,7 +121,7 @@ func passRedundantPhiEliminationOpt(b *builder) {
 	//  the maximum number of iteration was 22, which seems to be acceptable but not that small either since the
 	//  complexity here is O(BlockNum * Iterations) at the worst case where BlockNum might be the order of thousands.
 	//  -- Note --
-	// 	Currently, each iteration can run in an order of blocks, but it empirically converges quickly in practice when
+	// 	Currently, each iteration can run in any order of blocks, but it empirically converges quickly in practice when
 	// 	running on the reverse post-order. It might be possible to optimize this further by using the dominator tree.
 	for {
 		changed := false
@@ -355,18 +355,6 @@ func (b *builder) incRefCount(id ValueID, from *Instruction) {
 	b.valueRefCounts[id]++
 }
 
-// clearBlkVisited clears the b.blkVisited map so that we can reuse it for multiple places.
-func (b *builder) clearBlkVisited() {
-	b.blkStack2 = b.blkStack2[:0]
-	for key := range b.blkVisited {
-		b.blkStack2 = append(b.blkStack2, key)
-	}
-	for _, blk := range b.blkStack2 {
-		delete(b.blkVisited, blk)
-	}
-	b.blkStack2 = b.blkStack2[:0]
-}
-
 // passNopInstElimination eliminates the instructions which is essentially a no-op.
 func passNopInstElimination(b *builder) {
 	if int(b.nextValueID) >= len(b.valueIDToInstruction) {
diff --git a/internal/engine/wazevo/ssa/pass_blk_layouts.go b/internal/engine/wazevo/ssa/pass_blk_layouts.go
index bf9063919d..584b5eadea 100644
--- a/internal/engine/wazevo/ssa/pass_blk_layouts.go
+++ b/internal/engine/wazevo/ssa/pass_blk_layouts.go
@@ -23,8 +23,6 @@ import (
 //
 // This heuristic is done in maybeInvertBranches function.
 func passLayoutBlocks(b *builder) {
-	b.clearBlkVisited()
-
 	// We might end up splitting critical edges which adds more basic blocks,
 	// so we store the currently existing basic blocks in nonSplitBlocks temporarily.
 	// That way we can iterate over the original basic blocks while appending new ones into reversePostOrderedBasicBlocks.
@@ -47,20 +45,20 @@ func passLayoutBlocks(b *builder) {
 	for _, blk := range nonSplitBlocks {
 		for i := range blk.preds {
 			pred := blk.preds[i].blk
-			if _, ok := b.blkVisited[pred]; ok || !pred.Valid() {
+			if pred.visited == 1 || !pred.Valid() {
 				continue
 			} else if pred.reversePostOrder < blk.reversePostOrder {
 				// This means the edge is critical, and this pred is the trampoline and yet to be inserted.
 				// Split edge trampolines must come before the destination in reverse post-order.
 				b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, pred)
-				b.blkVisited[pred] = 0 // mark as inserted, the value is not used.
+				pred.visited = 1 // mark as inserted.
 			}
 		}
 
 		// Now that we've already added all the potential trampoline blocks incoming to this block,
 		// we can add this block itself.
 		b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, blk)
-		b.blkVisited[blk] = 0 // mark as inserted, the value is not used.
+		blk.visited = 1 // mark as inserted.
 
 		if len(blk.success) < 2 {
 			// There won't be critical edge originating from this block.
@@ -116,7 +114,7 @@ func passLayoutBlocks(b *builder) {
 			if fallthroughBranch.opcode == OpcodeJump && fallthroughBranch.blk == trampoline {
 				// This can be lowered as fallthrough at the end of the block.
 				b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, trampoline)
-				b.blkVisited[trampoline] = 0 // mark as inserted, the value is not used.
+				trampoline.visited = 1 // mark as inserted.
 			} else {
 				uninsertedTrampolines = append(uninsertedTrampolines, trampoline)
 			}
@@ -126,7 +124,7 @@ func passLayoutBlocks(b *builder) {
 			if trampoline.success[0].reversePostOrder <= trampoline.reversePostOrder { // "<=", not "<" because the target might be itself.
 				// This means the critical edge was backward, so we insert after the current block immediately.
 				b.reversePostOrderedBasicBlocks = append(b.reversePostOrderedBasicBlocks, trampoline)
-				b.blkVisited[trampoline] = 0 // mark as inserted, the value is not used.
+				trampoline.visited = 1 // mark as inserted.
 			} // If the target is forward, we can wait to insert until the target is inserted.
 		}
 		uninsertedTrampolines = uninsertedTrampolines[:0] // Reuse the stack for the next block.
@@ -142,7 +140,7 @@ func passLayoutBlocks(b *builder) {
 
 	if wazevoapi.SSAValidationEnabled {
 		for _, trampoline := range trampolines {
-			if _, ok := b.blkVisited[trampoline]; !ok {
+			if trampoline.visited != 1 {
 				panic("BUG: trampoline block not inserted: " + trampoline.formatHeader(b))
 			}
 			trampoline.validate(b)
diff --git a/internal/engine/wazevo/ssa/pass_blk_layouts_test.go b/internal/engine/wazevo/ssa/pass_blk_layouts_test.go
index f81bd80d84..cc4e004374 100644
--- a/internal/engine/wazevo/ssa/pass_blk_layouts_test.go
+++ b/internal/engine/wazevo/ssa/pass_blk_layouts_test.go
@@ -192,7 +192,7 @@ func TestBuilder_splitCriticalEdge(t *testing.T) {
 	predInfo := &basicBlockPredecessorInfo{blk: predBlk, branch: originalBrz}
 	trampoline := b.splitCriticalEdge(predBlk, dummyBlk, predInfo)
 	require.NotNil(t, trampoline)
-	require.Equal(t, 100, trampoline.reversePostOrder)
+	require.Equal(t, int32(100), trampoline.reversePostOrder)
 
 	require.Equal(t, trampoline, predInfo.blk)
 	require.Equal(t, originalBrz, predInfo.branch)
diff --git a/internal/engine/wazevo/ssa/pass_cfg.go b/internal/engine/wazevo/ssa/pass_cfg.go
index 1bec185ee5..109438ad75 100644
--- a/internal/engine/wazevo/ssa/pass_cfg.go
+++ b/internal/engine/wazevo/ssa/pass_cfg.go
@@ -15,10 +15,6 @@ import (
 // At the last of pass, this function also does the loop detection and sets the basicBlock.loop flag.
 func passCalculateImmediateDominators(b *builder) {
 	reversePostOrder := b.reversePostOrderedBasicBlocks[:0]
-	exploreStack := b.blkStack[:0]
-	b.clearBlkVisited()
-
-	entryBlk := b.entryBlk()
 
 	// Store the reverse postorder from the entrypoint into reversePostOrder slice.
 	// This calculation of reverse postorder is not described in the paper,
@@ -28,14 +24,17 @@ func passCalculateImmediateDominators(b *builder) {
 	// which is a reasonable assumption as long as SSA Builder is properly used.
 	//
 	// First we push blocks in postorder iteratively visit successors of the entry block.
-	exploreStack = append(exploreStack, entryBlk)
+	entryBlk := b.entryBlk()
+	exploreStack := append(b.blkStack[:0], entryBlk)
+	// These flags are used to track the state of the block in the DFS traversal.
+	// We use the basicBlock.visited field to store the state.
 	const visitStateUnseen, visitStateSeen, visitStateDone = 0, 1, 2
-	b.blkVisited[entryBlk] = visitStateSeen
+	entryBlk.visited = visitStateSeen
 	for len(exploreStack) > 0 {
 		tail := len(exploreStack) - 1
 		blk := exploreStack[tail]
 		exploreStack = exploreStack[:tail]
-		switch b.blkVisited[blk] {
+		switch blk.visited {
 		case visitStateUnseen:
 			// This is likely a bug in the frontend.
 			panic("BUG: unsupported CFG")
@@ -48,16 +47,18 @@ func passCalculateImmediateDominators(b *builder) {
 				if succ.ReturnBlock() || succ.invalid {
 					continue
 				}
-				if b.blkVisited[succ] == visitStateUnseen {
-					b.blkVisited[succ] = visitStateSeen
+				if succ.visited == visitStateUnseen {
+					succ.visited = visitStateSeen
 					exploreStack = append(exploreStack, succ)
 				}
 			}
 			// Finally, we could pop this block once we pop all of its successors.
-			b.blkVisited[blk] = visitStateDone
+			blk.visited = visitStateDone
 		case visitStateDone:
 			// Note: at this point we push blk in postorder despite its name.
 			reversePostOrder = append(reversePostOrder, blk)
+		default:
+			panic("BUG")
 		}
 	}
 	// At this point, reversePostOrder has postorder actually, so we reverse it.
@@ -67,7 +68,7 @@ func passCalculateImmediateDominators(b *builder) {
 	}
 
 	for i, blk := range reversePostOrder {
-		blk.reversePostOrder = i
+		blk.reversePostOrder = int32(i)
 	}
 
 	// Reuse the dominators slice if possible from the previous computation of function.