From ccef3a28a0ef87d2b10f42a105c36eb7a9e5c4c6 Mon Sep 17 00:00:00 2001 From: Hamza Khallouki Date: Tue, 4 Feb 2025 14:45:45 +0000 Subject: [PATCH] [AIE2P] Support fifo load with extra storage - InstructionSelect --- .../AIE/aie2p/AIE2PInstructionSelector.cpp | 67 +++++++++++++ .../lib/Target/AIE/aie2p/AIE2PRegisterInfo.td | 1 + .../GlobalIsel/inst-select-fifo-loads.mir | 56 +++++++++++ llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll | 94 +++++++++++++++++++ 4 files changed, 218 insertions(+) diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp index 36b1199c4c22..52394ab2ac85 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp +++ b/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp @@ -121,6 +121,8 @@ class AIE2PInstructionSelector : public AIEBaseInstructionSelector { bool selectVLD_FIFO_POP_BFP16_1D(MachineInstr &I, MachineRegisterInfo &MRI); bool selectVLD_FIFO_POP_BFP16_2D(MachineInstr &I, MachineRegisterInfo &MRI); bool selectVLD_FIFO_POP_BFP16_3D(MachineInstr &I, MachineRegisterInfo &MRI); + bool selectVLD_FIFO_FILLX(MachineInstr &I, MachineRegisterInfo &MRI); + bool selectVLD_FIFO_POPX(MachineInstr &I, MachineRegisterInfo &MRI); bool selectSetI128(MachineInstr &I, MachineOperand &DstReg, MachineOperand &SrcReg, MachineRegisterInfo &MRI); bool selectExtractI128(MachineInstr &I, Register DstReg, Register SrcReg, @@ -395,6 +397,10 @@ bool AIE2PInstructionSelector::select(MachineInstr &I) { return selectVST_FIFO(I, MRI); case Intrinsic::aie2p_fifo_ld_fill: return selectVLD_FIFO_FILL(I, MRI); + case Intrinsic::aie2p_fifo_ld_fillx: + return selectVLD_FIFO_FILLX(I, MRI); + case Intrinsic::aie2p_fifo_ld_popx: + return selectVLD_FIFO_POPX(I, MRI); case Intrinsic::aie2p_fifo_ld_pop_unaligned: return selectVLD_FIFO_POP_512(I, MRI); case Intrinsic::aie2p_fifo_ld_pop_1d_unaligned: @@ -2575,6 +2581,10 @@ unsigned getLoadFifoOpcode(MachineInstr &I) { return AIE2P::VLDB_POP_544_3D; case 
Intrinsic::aie2p_fifo_ld_pop_576_3d_bfp16: return AIE2P::VLDB_POP_576_3D; + case Intrinsic::aie2p_fifo_ld_fillx: + return AIE2P::VLDB_FILLX_512; + case Intrinsic::aie2p_fifo_ld_popx: + return AIE2P::VLDB_POPX_512; } llvm_unreachable("unreachable: Failed to get sparse load opcode"); return AIE2P::INSTRUCTION_LIST_END; @@ -2596,6 +2606,34 @@ bool AIE2PInstructionSelector::selectVLD_FIFO_FILL(MachineInstr &I, return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); } +bool AIE2PInstructionSelector::selectVLD_FIFO_FILLX(MachineInstr &I, + MachineRegisterInfo &MRI) { + unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID(); + assert(IntrinsicID == Intrinsic::aie2p_fifo_ld_fillx); + Register PtrOut = I.getOperand(0).getReg(); + Register FifoOut = I.getOperand(1).getReg(); + Register AvailOut = I.getOperand(2).getReg(); + Register ExtraOut = I.getOperand(3).getReg(); + Register PtrIn = I.getOperand(5).getReg(); + Register FifoIn = I.getOperand(6).getReg(); + Register AvailIn = I.getOperand(7).getReg(); + Register ExtraIn = I.getOperand(8).getReg(); + Register ConfIn = I.getOperand(9).getReg(); + + MIB.buildInstr(TargetOpcode::COPY, {AIE2P::lfe}, {}).addReg(ExtraIn); + MachineInstrBuilder MI = + MIB.buildInstr(getLoadFifoOpcode(I), {PtrOut, FifoOut, AvailOut}, + {ConfIn, ConfIn, PtrIn, FifoIn, AvailIn}); + auto CopyBackLfeMI = + MIB.buildInstr(TargetOpcode::COPY, {ExtraOut}, {}).addReg(AIE2P::lfe); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI) && + constrainOperandRegClass(*MF, TRI, MRI, TII, RBI, *CopyBackLfeMI, + AIE2P::mExtraRegClass, + CopyBackLfeMI->getOperand(0)); +} + bool AIE2PInstructionSelector::selectVLD_FIFO_POP_512( MachineInstr &I, MachineRegisterInfo &MRI) { unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID(); @@ -2614,6 +2652,35 @@ bool AIE2PInstructionSelector::selectVLD_FIFO_POP_512( return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); } +bool AIE2PInstructionSelector::selectVLD_FIFO_POPX(MachineInstr &I, +
MachineRegisterInfo &MRI) { + unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID(); + assert(IntrinsicID == Intrinsic::aie2p_fifo_ld_popx); + Register VecOut = I.getOperand(0).getReg(); + Register PtrOut = I.getOperand(1).getReg(); + Register FifoOut = I.getOperand(2).getReg(); + Register AvailOut = I.getOperand(3).getReg(); + Register ExtraOut = I.getOperand(4).getReg(); + Register PtrIn = I.getOperand(6).getReg(); + Register FifoIn = I.getOperand(7).getReg(); + Register AvailIn = I.getOperand(8).getReg(); + Register ExtraIn = I.getOperand(9).getReg(); + Register ConfIn = I.getOperand(10).getReg(); + + MIB.buildInstr(TargetOpcode::COPY, {AIE2P::lfe}, {}).addReg(ExtraIn); + MachineInstrBuilder MI = + MIB.buildInstr(getLoadFifoOpcode(I), {VecOut, PtrOut, FifoOut, AvailOut}, + {ConfIn, ConfIn, PtrIn, FifoIn, AvailIn}); + auto CopyBackLfeMI = + MIB.buildInstr(TargetOpcode::COPY, {ExtraOut}, {}).addReg(AIE2P::lfe); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI) && + constrainOperandRegClass(*MF, TRI, MRI, TII, RBI, *CopyBackLfeMI, + AIE2P::mExtraRegClass, + CopyBackLfeMI->getOperand(0)); +} + bool AIE2PInstructionSelector::selectVLD_FIFO_POP_512_1D( MachineInstr &I, MachineRegisterInfo &MRI) { unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID(); diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PRegisterInfo.td b/llvm/lib/Target/AIE/aie2p/AIE2PRegisterInfo.td index f689281c6aaf..adc322b352f1 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PRegisterInfo.td +++ b/llvm/lib/Target/AIE/aie2p/AIE2PRegisterInfo.td @@ -302,6 +302,7 @@ class AIE2PDim3DRegisterClass def eLdFifoReg : AIE2PVector1024RegisterClass<(add lf0, lf1)>; def mFifoHLReg : AIE2PVector512RegisterClass<(add sfh, sfl, lfh0, lfh1, lfl0, lfl1, lfe)>; + def mExtra : AIE2PVector512RegisterClass<(add lfe)>; def mStFifo : AIE2PVector1024RegisterClass<(add sf)>; def mStFifoh : AIE2PVector512RegisterClass<(add sfh)>; def mStFifol : AIE2PVector512RegisterClass<(add sfl)>; diff --git
a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-fifo-loads.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-fifo-loads.mir index 8a02b3526913..c8621bd111cf 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-fifo-loads.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-fifo-loads.mir @@ -29,6 +29,62 @@ body: | PseudoRET implicit $lr, implicit %7, implicit %8, implicit %9 ... +--- +name: ld_fillx +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: + ; CHECK-LABEL: name: ld_fillx + ; CHECK: [[DEF:%[0-9]+]]:eps = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:eldfiforeg = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:erf2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:fifo512 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:mr30_fifo_step_e1 = IMPLICIT_DEF + ; CHECK-NEXT: $lfe = COPY [[DEF3]] + ; CHECK-NEXT: [[VLDB_FILLX_512_:%[0-9]+]]:eps, [[VLDB_FILLX_512_1:%[0-9]+]]:eldfiforeg, [[VLDB_FILLX_512_2:%[0-9]+]]:erf2 = VLDB_FILLX_512 [[DEF4]], [[DEF4]], [[DEF]], [[DEF1]], [[DEF2]], implicit-def $lfe, implicit $lfe + ; CHECK-NEXT: [[COPY:%[0-9]+]]:mextra = COPY $lfe + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDB_FILLX_512_]], implicit [[VLDB_FILLX_512_1]], implicit [[VLDB_FILLX_512_2]], implicit [[COPY]] + %2:modregbank(s20) = G_IMPLICIT_DEF + %4:ptrregbank(p0) = G_IMPLICIT_DEF + %5:fiforegbank(<32 x s32>) = G_IMPLICIT_DEF + %6:gprregbank(s32) = G_IMPLICIT_DEF + %10:fiforegbank(<16 x s32>) = G_IMPLICIT_DEF + %11:gprregbank(s32) = G_IMPLICIT_DEF + %7:ptrregbank(p0), %8:fiforegbank(<32 x s32>), %9:gprregbank(s32), %12:fiforegbank(<16 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.ld.fillx), %4:ptrregbank(p0), %5:fiforegbank(<32 x s32>), %6:gprregbank(s32), %10:fiforegbank(<16 x s32>), %11:gprregbank(s32) + PseudoRET implicit $lr, implicit %7, implicit %8, implicit %9, implicit %12 +... 
+ +--- +name: ld_popx +tracksRegLiveness: true +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: + ; CHECK-LABEL: name: ld_popx + ; CHECK: [[DEF:%[0-9]+]]:eps = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:eldfiforeg = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:erf2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:fifo512 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:mr30_fifo_step_e1 = IMPLICIT_DEF + ; CHECK-NEXT: $lfe = COPY [[DEF3]] + ; CHECK-NEXT: [[VLDB_POPX_512_:%[0-9]+]]:vec512, [[VLDB_POPX_512_1:%[0-9]+]]:eps, [[VLDB_POPX_512_2:%[0-9]+]]:eldfiforeg, [[VLDB_POPX_512_3:%[0-9]+]]:erf2 = VLDB_POPX_512 [[DEF4]], [[DEF4]], [[DEF]], [[DEF1]], [[DEF2]], implicit-def $lfe, implicit-def $srfifo_uf, implicit $lfe + ; CHECK-NEXT: [[COPY:%[0-9]+]]:mextra = COPY $lfe + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDB_POPX_512_]], implicit [[VLDB_POPX_512_1]], implicit [[VLDB_POPX_512_2]], implicit [[VLDB_POPX_512_3]], implicit [[COPY]] + %3:modregbank(s20) = G_IMPLICIT_DEF + %5:ptrregbank(p0) = G_IMPLICIT_DEF + %6:fiforegbank(<32 x s32>) = G_IMPLICIT_DEF + %7:gprregbank(s32) = G_IMPLICIT_DEF + %12:fiforegbank(<16 x s32>) = G_IMPLICIT_DEF + %13:gprregbank(s32) = G_IMPLICIT_DEF + %8:vregbank(<64 x s8>), %9:ptrregbank(p0), %10:fiforegbank(<32 x s32>), %11:gprregbank(s32), %14:fiforegbank(<16 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.fifo.ld.popx), %5:ptrregbank(p0), %6:fiforegbank(<32 x s32>), %7:gprregbank(s32), %12:fiforegbank(<16 x s32>), %13:gprregbank(s32) + PseudoRET implicit $lr, implicit %8, implicit %9, implicit %10, implicit %11, implicit %14 +... 
+ --- name: pop_unaligned tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll b/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll index e7f98db34fbb..db8a52529a4f 100644 --- a/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll +++ b/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll @@ -624,6 +624,98 @@ entry: ret %struct.v64bfp16ebs16 %.fca.1.insert.i } +define dso_local noundef <64 x i8> @_Z17test_fifo_ld_popxRPDv64_hR12fifo_state_t(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s) local_unnamed_addr #0 { +; CHECK-LABEL: _Z17test_fifo_ld_popxRPDv64_hR12fifo_state_t: +; CHECK: .p2align 4 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r24, [p1, dj0] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: vlda lfl0, [p1, #0] +; CHECK-NEXT: vlda x0, [p1, #192]; mov p2, p0 +; CHECK-NEXT: vlda lfh0, [p1, #64] +; CHECK-NEXT: movxm r30, #2015 +; CHECK-NEXT: vldb.popx.512 x0, [p0, lf0, r24] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: vmov lfe, x0 +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: vst lfl0, [p1, #0]; ret lr +; CHECK-NEXT: st r24, [p1, dj0]; vmov x2, lfe // Delay Slot 5 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst x2, [p1, #192] // Delay Slot 3 +; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 +entry: + %pos1.i.i = getelementptr inbounds i8, ptr %s, i20 128 + %extra3.i.i = getelementptr inbounds i8, ptr %s, i20 192 + %0 = load ptr, ptr %p, align 4, !tbaa !2 + %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6 + %2 = load i32, ptr %pos1.i.i, align 64, !tbaa !7 + %3 = load <16 x i32>, ptr %extra3.i.i, align 64, !tbaa !6 + %4 = tail call { <64 x i8>, ptr, <32 x i32>, i32, <16 x i32> } @llvm.aie2p.fifo.ld.popx(ptr %0, <32 x i32> %1, i32 %2, <16 x i32> %3, i32 2015) + %5 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32, <16 x i32> } %4, 0 + %6 = extractvalue { <64 x 
i8>, ptr, <32 x i32>, i32, <16 x i32> } %4, 1 + %7 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32, <16 x i32> } %4, 2 + %8 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32, <16 x i32> } %4, 3 + %9 = extractvalue { <64 x i8>, ptr, <32 x i32>, i32, <16 x i32> } %4, 4 + store <16 x i32> %9, ptr %extra3.i.i, align 64 + store <32 x i32> %7, ptr %s, align 128 + store i32 %8, ptr %pos1.i.i, align 64 + store ptr %6, ptr %p, align 4 + ret <64 x i8> %5 +} + +define dso_local void @_Z18test_fifo_ld_fillxRP22v64bfp16ebs8_unalignedR12fifo_state_tii(ptr nocapture nonnull align 4 dereferenceable(4) %p, ptr nocapture nonnull align 64 dereferenceable(256) %s, i32 noundef %step, i32 noundef %mask) local_unnamed_addr #0 { +; CHECK-LABEL: _Z18test_fifo_ld_fillxRP22v64bfp16ebs8_unalignedR12fifo_state_tii: +; CHECK: .p2align 4 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nops ; nopx ; mov dj0, #128; nopv +; CHECK-NEXT: lda r24, [p1, dj0]; nopx +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: vlda lfl0, [p1, #0] +; CHECK-NEXT: vlda x0, [p1, #192]; movx r2, #6; mov p2, p0 +; CHECK-NEXT: vlda lfh0, [p1, #64]; lshl r0, r0, r2 +; CHECK-NEXT: or r30, r0, r1 +; CHECK-NEXT: vldb.fillx.512 [p0, lf0, r24] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: vmov lfe, x0 +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: vst lfl0, [p1, #0]; ret lr +; CHECK-NEXT: st r24, [p1, dj0]; vmov x0, lfe // Delay Slot 5 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 4 +; CHECK-NEXT: vst x0, [p1, #192] // Delay Slot 3 +; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 +entry: + %pos1.i = getelementptr inbounds i8, ptr %s, i20 128 + %extra3.i = getelementptr inbounds i8, ptr %s, i20 192 + %shl.i = shl i32 %step, 6 + %or.i = or i32 %shl.i, %mask + %0 = load ptr, ptr %p, align 4, !tbaa !2 + %1 = load <32 x i32>, ptr %s, align 64, !tbaa !6 + %2 = load i32, ptr %pos1.i, align 64, !tbaa !7 + %3 = load <16 x i32>, ptr 
%extra3.i, align 64, !tbaa !6 + %4 = tail call { ptr, <32 x i32>, i32, <16 x i32> } @llvm.aie2p.fifo.ld.fillx(ptr %0, <32 x i32> %1, i32 %2, <16 x i32> %3, i32 %or.i) + %5 = extractvalue { ptr, <32 x i32>, i32, <16 x i32> } %4, 0 + %6 = extractvalue { ptr, <32 x i32>, i32, <16 x i32> } %4, 1 + %7 = extractvalue { ptr, <32 x i32>, i32, <16 x i32> } %4, 2 + %8 = extractvalue { ptr, <32 x i32>, i32, <16 x i32> } %4, 3 + store <16 x i32> %8, ptr %extra3.i, align 64 + store <32 x i32> %6, ptr %s, align 128 + store i32 %7, ptr %pos1.i, align 64 + store ptr %5, ptr %p, align 4 + ret void +} + + declare { ptr, <32 x i32>, i32 } @llvm.aie2p.fifo.ld.fill(ptr, <32 x i32>, i32) #5 @@ -639,6 +731,8 @@ declare { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.54 declare { ptr, <32 x i32>, i32, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.544.1d.bfp16(ptr, <32 x i32>, i32, i20) #5 declare { ptr, <32 x i32>, i32, i20, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.544.2d.bfp16(ptr, <32 x i32>, i32, i20, i20, i20, i20) #5 declare { ptr, <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.544.3d.bfp16(ptr, <32 x i32>, i32, i20, i20, i20, i20, i20, i20, i20) #5 +declare { <64 x i8>, ptr, <32 x i32>, i32, <16 x i32> } @llvm.aie2p.fifo.ld.popx(ptr, <32 x i32>, i32, <16 x i32>, i32) #1 +declare { ptr, <32 x i32>, i32, <16 x i32> } @llvm.aie2p.fifo.ld.fillx(ptr, <32 x i32>, i32, <16 x i32>, i32) #1 !2 = !{!3, !3, i64 0} !3 = !{!"any pointer", !4, i64 0}