From a6fea913e89ecaab88e470c5491ad8056d3dea3f Mon Sep 17 00:00:00 2001 From: Takeshi Yoneda Date: Thu, 14 Dec 2023 12:45:44 -0800 Subject: [PATCH] wazevo(arm64): relocations for large conditional branches (#1873) Signed-off-by: Takeshi Yoneda --- .../engine/wazevo/backend/isa/arm64/instr.go | 6 +- .../backend/isa/arm64/instr_encoding_test.go | 2 +- .../wazevo/backend/isa/arm64/machine.go | 125 ++++++++++++++---- .../wazevo/backend/isa/arm64/machine_test.go | 97 ++++++++++++++ 4 files changed, 205 insertions(+), 25 deletions(-) diff --git a/internal/engine/wazevo/backend/isa/arm64/instr.go b/internal/engine/wazevo/backend/isa/arm64/instr.go index 3f5a33ec1c..3aa50e4c8a 100644 --- a/internal/engine/wazevo/backend/isa/arm64/instr.go +++ b/internal/engine/wazevo/backend/isa/arm64/instr.go @@ -647,7 +647,7 @@ func (i *instruction) brLabel() label { } // brOffsetResolved is called when the target label is resolved. -func (i *instruction) brOffsetResolved(offset int64) { +func (i *instruction) brOffsetResolve(offset int64) { i.u2 = uint64(offset) i.u3 = 1 // indicate that the offset is resolved, for debugging. 
} @@ -666,6 +666,10 @@ func (i *instruction) asCondBr(c cond, target label, is64bit bool) { } } +func (i *instruction) setCondBrTargets(target label) { + i.u2 = uint64(target) +} + func (i *instruction) condBrLabel() label { return label(i.u2) } diff --git a/internal/engine/wazevo/backend/isa/arm64/instr_encoding_test.go b/internal/engine/wazevo/backend/isa/arm64/instr_encoding_test.go index 58242bf619..924a81197c 100644 --- a/internal/engine/wazevo/backend/isa/arm64/instr_encoding_test.go +++ b/internal/engine/wazevo/backend/isa/arm64/instr_encoding_test.go @@ -1280,7 +1280,7 @@ func TestInstruction_encode(t *testing.T) { }}, {want: "20000014", setup: func(i *instruction) { i.asBr(dummyLabel) - i.brOffsetResolved(0x80) + i.brOffsetResolve(0x80) }}, {want: "01040034", setup: func(i *instruction) { i.asCondBr(registerAsRegZeroCond(x1VReg), dummyLabel, false) diff --git a/internal/engine/wazevo/backend/isa/arm64/machine.go b/internal/engine/wazevo/backend/isa/arm64/machine.go index 59d60e2952..ecd9d0ccb5 100644 --- a/internal/engine/wazevo/backend/isa/arm64/machine.go +++ b/internal/engine/wazevo/backend/isa/arm64/machine.go @@ -43,6 +43,9 @@ type ( addends64 queue[regalloc.VReg] unresolvedAddressModes []*instruction + // condBrRelocs holds the conditional branches which need offset relocation. + condBrRelocs []condBrReloc + // spillSlotSize is the size of the stack slot in bytes used for spilling registers. // During the execution of the function, the stack looks like: // @@ -98,10 +101,20 @@ type ( // labelPosition represents the regions of the generated code which the label represents. labelPosition struct { + l label begin, end *instruction binarySize int64 binaryOffset int64 } + + condBrReloc struct { + cbr *instruction + // currentLabelPos is the labelPosition within which condBr is defined. + currentLabelPos *labelPosition + // nextLabel is the label (not the labelPosition) of the block that follows in m.orderedBlockLabels. 
+ nextLabel label + offset int64 + } ) const ( @@ -205,7 +218,7 @@ func (m *machine) StartBlock(blk ssa.BasicBlock) { labelPos, ok := m.labelPositions[l] if !ok { - labelPos = m.allocateLabelPosition() + labelPos = m.allocateLabelPosition(l) m.labelPositions[l] = labelPos } m.orderedBlockLabels = append(m.orderedBlockLabels, labelPos) @@ -231,18 +244,24 @@ func (m *machine) insert(i *instruction) { } func (m *machine) insertBrTargetLabel() label { - l := m.allocateLabel() - nop := m.allocateInstr() - nop.asNop0WithLabel(l) + nop, l := m.allocateBrTarget() m.insert(nop) - pos := m.allocateLabelPosition() + return l +} + +func (m *machine) allocateBrTarget() (nop *instruction, l label) { + l = m.allocateLabel() + nop = m.allocateInstr() + nop.asNop0WithLabel(l) + pos := m.allocateLabelPosition(l) pos.begin, pos.end = nop, nop m.labelPositions[l] = pos - return l + return } -func (m *machine) allocateLabelPosition() *labelPosition { +func (m *machine) allocateLabelPosition(la label) *labelPosition { l := m.labelPositionPool.Allocate() + l.l = la return l } @@ -344,17 +363,34 @@ func (m *machine) ResolveRelativeAddresses() { } } + // Reuse the slice to gather the unresolved conditional branches. + cbrs := m.condBrRelocs[:0] + // Next, in order to determine the offsets of relative jumps, we have to calculate the size of each label. var offset int64 - for _, pos := range m.orderedBlockLabels { + for i, pos := range m.orderedBlockLabels { pos.binaryOffset = offset var size int64 for cur := pos.begin; ; cur = cur.next { - if cur.kind == nop0 { + switch cur.kind { + case nop0: l := cur.nop0Label() if pos, ok := m.labelPositions[l]; ok { pos.binaryOffset = offset + size } + case condBr: + if !cur.condBrOffsetResolved() { + var nextLabel label + if i < len(m.orderedBlockLabels)-1 { + // Note: this is only used when the block ends with fallthrough, + // therefore can be safely assumed that the next block exists when it's needed. 
+ nextLabel = m.orderedBlockLabels[i+1].l + } + cbrs = append(cbrs, condBrReloc{ + cbr: cur, currentLabelPos: pos, offset: offset + size, + nextLabel: nextLabel, + }) + } } size += cur.size() if cur == pos.end { @@ -365,6 +401,30 @@ func (m *machine) ResolveRelativeAddresses() { offset += size } + // Before resolving any offsets, we need to check if all the conditional branches can be resolved. + var needRerun bool + for i := range cbrs { + reloc := &cbrs[i] + cbr := reloc.cbr + offset := reloc.offset + + target := cbr.condBrLabel() + offsetOfTarget := m.labelPositions[target].binaryOffset + diff := offsetOfTarget - offset + if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 { + // In this case, the conditional branch target is out of the signed 19-bit range. We place the trampoline instructions at the end of the current block, + // and make the conditional branch jump to it instead. + m.insertConditionalJumpTrampoline(cbr, reloc.currentLabelPos, reloc.nextLabel) + // Then, we need to call this function again to fix up the label offsets + // as they have changed after the trampoline is inserted. + needRerun = true + } + } + if needRerun { + m.ResolveRelativeAddresses() + return + } + var currentOffset int64 for cur := m.rootInstr; cur != nil; cur = cur.next { switch cur.kind { @@ -372,29 +432,19 @@ func (m *machine) ResolveRelativeAddresses() { target := cur.brLabel() offsetOfTarget := m.labelPositions[target].binaryOffset diff := offsetOfTarget - currentOffset - if diff%4 != 0 { - panic("BUG: offsets between b and the target must be a multiple of 4") - } divided := diff >> 2 if divided < minSignedInt26 || divided > maxSignedInt26 { // This means the currently compiled single function is extremely large. 
- panic("BUG: implement branch relocation for large unconditional branch larger than 26-bit range") + panic("too large function that requires branch relocation of large unconditional branch larger than 26-bit range") } - cur.brOffsetResolved(diff) + cur.brOffsetResolve(diff) case condBr: if !cur.condBrOffsetResolved() { target := cur.condBrLabel() offsetOfTarget := m.labelPositions[target].binaryOffset diff := offsetOfTarget - currentOffset - if diff%4 != 0 { - panic("BUG: offsets between b and the target must be a multiple of 4") - } - divided := diff >> 2 - if divided < minSignedInt19 || divided > maxSignedInt19 { - // This case we can insert "trampoline block" in the middle and jump to it. - // After that, we need to re-calculate the offset of labels after the trampoline block by - // recursively calling this function. - panic("TODO: implement branch relocation for large conditional branch larger than 19-bit range") + if divided := diff >> 2; divided < minSignedInt19 || divided > maxSignedInt19 { + panic("BUG: branch relocation for large conditional branch larger than 19-bit range must be handled properly") } cur.condBrOffsetResolve(diff) } @@ -421,6 +471,35 @@ const ( minSignedInt19 int64 = -(1 << 19) ) +func (m *machine) insertConditionalJumpTrampoline(cbr *instruction, currentBlk *labelPosition, nextLabel label) { + cur := currentBlk.end + originalTarget := cbr.condBrLabel() + endNext := cur.next + + if cur.kind != br { + // If the current block already ends with an unconditional branch, we can just insert the trampoline after it. + // Otherwise, we need to insert a "skip" branch to the next block so that the fallthrough path jumps over the trampoline instructions. 
+ skip := m.allocateInstr() + skip.asBr(nextLabel) + cur = linkInstr(cur, skip) + } + + cbrNewTargetInstr, cbrNewTargetLabel := m.allocateBrTarget() + cbr.setCondBrTargets(cbrNewTargetLabel) + cur = linkInstr(cur, cbrNewTargetInstr) + + // Then insert the unconditional branch to the original, which should be possible to get encoded + // as 26-bit offset should be enough for any practical application. + br := m.allocateInstr() + br.asBr(originalTarget) + cur = linkInstr(cur, br) + + // Update the end of the current block. + currentBlk.end = cur + + linkInstr(cur, endNext) +} + func (m *machine) getOrAllocateSSABlockLabel(blk ssa.BasicBlock) label { if blk.ReturnBlock() { return returnLabel diff --git a/internal/engine/wazevo/backend/isa/arm64/machine_test.go b/internal/engine/wazevo/backend/isa/arm64/machine_test.go index da4227c7af..48e3dd3039 100644 --- a/internal/engine/wazevo/backend/isa/arm64/machine_test.go +++ b/internal/engine/wazevo/backend/isa/arm64/machine_test.go @@ -123,3 +123,100 @@ func TestMachine_getVRegSpillSlotOffsetFromSP(t *testing.T) { _, ok = m.spillSlots[id] require.True(t, ok) } + +func TestMachine_insertConditionalJumpTrampoline(t *testing.T) { + for _, tc := range []struct { + brAtEnd bool + expBefore, expAfter string + }{ + { + brAtEnd: true, + expBefore: ` +L100: + b.eq L12345 + b L888888888 +L200: + exit_sequence x0 +`, + expAfter: ` +L100: + b.eq L10000000 + b L888888888 +L10000000: + b L12345 +L200: + exit_sequence x0 +`, + }, + { + brAtEnd: false, + expBefore: ` +L100: + b.eq L12345 + udf +L200: + exit_sequence x0 +`, + expAfter: ` +L100: + b.eq L10000000 + udf + b L200 +L10000000: + b L12345 +L200: + exit_sequence x0 +`, + }, + } { + var name string + if tc.brAtEnd { + name = "brAtEnd" + } else { + name = "brNotAtEnd" + } + + t.Run(name, func(t *testing.T) { + m := NewBackend().(*machine) + const ( + originLabel = 100 + originLabelNext = 200 + targetLabel = 12345 + ) + + cbr := m.allocateInstr() + cbr.asCondBr(eq.asCond(), 
targetLabel, false) + + end := m.allocateInstr() + if tc.brAtEnd { + end.asBr(888888888) + } else { + end.asUDF() + } + + originalEndNext := m.allocateInstr() + originalEndNext.asExitSequence(x0VReg) + + originLabelPos := m.allocateLabelPosition(originLabel) + originLabelPos.begin = cbr + originLabelPos.end = linkInstr(cbr, end) + originNextLabelPos := m.allocateLabelPosition(originLabelNext) + originNextLabelPos.begin = originalEndNext + linkInstr(originLabelPos.end, originalEndNext) + + m.labelPositions[originLabel] = originLabelPos + m.labelPositions[originLabelNext] = originNextLabelPos + + m.rootInstr = cbr + require.Equal(t, tc.expBefore, m.Format()) + + m.nextLabel = 9999999 + m.insertConditionalJumpTrampoline(cbr, originLabelPos, originLabelNext) + + require.Equal(t, tc.expAfter, m.Format()) + + // The original label position should be updated to the unconditional jump to the original target destination. + require.Equal(t, "b L12345", originLabelPos.end.String()) + }) + } +}