From bc1e5c6a096d8f963d9730c929989b46c6bf4464 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Mon, 5 Aug 2024 11:57:47 +0100 Subject: [PATCH 01/31] [AIE2] NFC: Add baseline test for complex loop-aware sched convergence We want to increase the safety margin for one instruction at a time here, instead of doing it for all instructions at once. --- .../aie2/schedule/loopaware/Add2D-like.mir | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir new file mode 100644 index 000000000000..74765446d4f5 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir @@ -0,0 +1,109 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py + +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2 --run-pass=postmisched \ +# RUN: %s -o - | FileCheck %s + +# A simplified example of a SW-pipelined 2xVLD.UPS -> VADD -> VST.SRS loop. +# We want to make sure the VST.SRS ends up in the last cycle of the loop, +# and the VLD.UPS in the first cycles. +# This means that in the fixpoint loop for loop-aware-scheduling, one needs to +# increase the safety margin for one instruction at a time: The VLDs need to be +# pushed up, not the VST. +# FIXME: Actually do this. 
+--- +name: add2d +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: add2d + ; CHECK: bb.0 (align 16): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $dj0, $dj4, $dn0, $dn4, $m0, $m1, $p1, $p2, $p3, $r0, $r1, $r2, $r3, $r4, $r5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $dc0 = MOVA_lda_cg 0 + ; CHECK-NEXT: BUNDLE implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit-def $p1, implicit-def $srups_of, implicit-def $dc4, implicit $s1, implicit killed $p1, implicit $m1, implicit $crsat, implicit $crupssign, implicit $dc0 { + ; CHECK-NEXT: renamable $cm0, renamable $p1 = VLDA_UPS_S32_D8_ag_pstm_nrm renamable $s1, killed renamable $p1, renamable $m1, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + ; CHECK-NEXT: $dc4 = MOV_mv_scl $dc0 + ; CHECK-NEXT: } + ; CHECK-NEXT: $cm4, $p2, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 killed $s1, killed $p2, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $s1 = MOV_mv_scl killed $r2 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: BUNDLE implicit-def $r1, implicit-def dead $srcarry, implicit-def $s0, implicit killed $r1, implicit killed $r4 { + ; CHECK-NEXT: renamable $r1 = ADD_add_r_ri killed renamable $r1, -4, implicit-def dead $srcarry + ; CHECK-NEXT: $s0 = MOV_mv_scl killed $r4 + ; CHECK-NEXT: } + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1 (align 16): + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $cm0, $cm4, $dc0, $dc4, $dj0, $dj4, $dn0, $dn4, $m0, $m1, $p1, $p2, $p3, $r0, $r1, $s0, $s1, $d0_3d:0x000000000001C870 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $cm8 = VADD killed renamable $cm4, killed renamable $cm0, renamable $r0 + ; 
CHECK-NEXT: renamable $cm0, renamable $p1 = VLDA_UPS_S32_D8_ag_pstm_nrm renamable $s1, killed renamable $p1, renamable $m1, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + ; CHECK-NEXT: BUNDLE implicit-def $cm4, implicit-def $bml4, implicit-def $amll4, implicit-def $amlh4, implicit-def $bmh4, implicit-def $amhl4, implicit-def $amhh4, implicit-def $p2, implicit-def $dc0, implicit-def $dc4, implicit-def $srups_of, implicit-def $r1, implicit-def dead $srcarry, implicit $s1, implicit killed $p2, implicit $d0_3d, implicit $crsat, implicit $crupssign, implicit killed $r1 { + ; CHECK-NEXT: $cm4, $p2, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 $s1, killed $p2, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: renamable $r1 = ADD_add_r_ri killed renamable $r1, -4, implicit-def dead $srcarry + ; CHECK-NEXT: } + ; CHECK-NEXT: JNZ renamable $r1, %bb.1 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: renamable $p3 = VST_SRS_D8_S32_ag_pstm_nrm_imm killed renamable $p3, 32, killed renamable $cm8, renamable $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $cm0, $cm4, $p3, $r0, $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $cm0 = VADD killed renamable $cm4, killed renamable $cm0, killed renamable $r0 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: NOP + ; CHECK-NEXT: renamable $p3 = VST_SRS_D8_S32_ag_pstm_nrm_imm killed renamable $p3, 32, killed renamable $cm0, killed renamable $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier + bb.0 (align 16): + successors: %bb.1 + 
liveins: $dj0, $dj4, $dn0, $dn4, $m0, $m1, $p1, $p2, $p3, $r0, $r1, $r2, $r3, $r4, $r5 + + renamable $dc0 = MOV_PD_imm10_pseudo 0 + $s1 = MOV_mv_scl killed $r2 + $s0 = MOV_mv_scl killed $r4 + renamable $cm0, renamable $p1 = VLDA_UPS_S32_D8_ag_pstm_nrm renamable $s1, killed renamable $p1, renamable $m1, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + $dc4 = MOV_mv_scl $dc0 + $cm4, $p2, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 $s1, killed $p2, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + renamable $r1 = ADD_add_r_ri killed renamable $r1, -4, implicit-def dead $srcarry + + bb.1 (align 16): + successors: %bb.1, %bb.2 + liveins: $cm0, $cm4, $dc0, $dc4, $dj0, $dj4, $dn0, $dn4, $m0, $m1, $p1, $p2, $p3, $r0, $r1, $s0, $s1, $d0_3d:0x000000000001C870 + + renamable $cm8 = VADD killed renamable $cm4, killed renamable $cm0, renamable $r0 + renamable $p3 = VST_SRS_D8_S32_ag_pstm_nrm_imm killed renamable $p3, 32, killed renamable $cm8, renamable $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + renamable $cm0, renamable $p1 = VLDA_UPS_S32_D8_ag_pstm_nrm renamable $s1, killed renamable $p1, renamable $m1, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + $cm4, $p2, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 $s1, killed $p2, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + renamable $r1 = ADD_add_r_ri killed renamable $r1, -4, implicit-def dead $srcarry + JNZ renamable $r1, %bb.1 + DelayedSchedBarrier + + bb.2: + liveins: $cm0, $cm4, $p3, $r0, $s0 + + renamable $cm0 = VADD killed renamable $cm4, killed renamable $cm0, renamable $r0 + renamable $p3 = VST_SRS_D8_S32_ag_pstm_nrm_imm killed renamable $p3, 32, killed renamable $cm0, renamable $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, 
implicit $crsrssign + RET implicit $lr + DelayedSchedBarrier +... From a0bd75369b4c2426fb5b8f3c62527e798ec4e6d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Mon, 5 Aug 2024 12:27:46 +0100 Subject: [PATCH 02/31] [AIEX] Loop-aware sched: Increase latency margin per instruction --- .../Target/AIE/AIEInterBlockScheduling.cpp | 45 +++++++++++----- llvm/lib/Target/AIE/AIEInterBlockScheduling.h | 8 ++- llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp | 2 +- .../AIE/aie2/end-to-end/Conv2D-red-swp.ll | 52 +++++++++---------- .../aie2/schedule/loopaware/Add2D-like.mir | 16 +++--- .../aie2/schedule/loopaware/short-hwloop.mir | 7 ++- 6 files changed, 76 insertions(+), 54 deletions(-) diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp index e163e6f8ab29..b19a7545752c 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp @@ -37,6 +37,10 @@ static cl::opt LoopEpilogueAnalysis( "aie-loop-epilogue-analysis", cl::init(true), cl::desc("[AIE] Perform Loop/Epilogue analysis with loop scheduling")); +static cl::opt MaxExpensiveIterations( + "aie-loop-aware-expensive-iterations", cl::init(25), + cl::desc("[AIE] Perform Loop/Epilogue analysis with loop scheduling")); + namespace llvm::AIE { void dumpInterBlock(const InterBlockEdges &Edges) { @@ -166,10 +170,13 @@ bool InterBlockScheduling::leaveBlock() { if (BS.Kind == BlockType::Loop && !updateFixPoint(BS)) { BS.FixPoint.NumIters++; // Iterate on CurrentBlock + // We will first try to increase the latency margin for one instruction at + // a time, before increasing that margin for all instructions at once. // If we are very unlucky, we may step both the latency margin and // the resource margin to the max. Any more indicates failure to converge, // and we abort to prevent an infinite loop. 
- if (BS.FixPoint.NumIters > 2 * HR->getConflictHorizon()) { + if (BS.FixPoint.NumIters > + MaxExpensiveIterations + 2 * HR->getConflictHorizon()) { report_fatal_error("Inter-block scheduling did not converge."); } return false; @@ -219,7 +226,7 @@ bool InterBlockScheduling::resourcesConverged(BlockState &BS) const { return true; } -bool InterBlockScheduling::latencyConverged(BlockState &BS) const { +MachineInstr *InterBlockScheduling::latencyConverged(BlockState &BS) const { const auto &SubTarget = BS.TheBlock->getParent()->getSubtarget(); auto *TII = static_cast(SubTarget.getInstrInfo()); auto *ItinData = SubTarget.getInstrItineraryData(); @@ -283,7 +290,7 @@ bool InterBlockScheduling::latencyConverged(BlockState &BS) const { << " not met (" << Distance << ")\n"); DEBUG_LOOPAWARE(dbgs() << " " << Succ->NodeNum << ": " << *Succ->getInstr()); - return false; + return Pred->getInstr(); } } } @@ -296,7 +303,7 @@ bool InterBlockScheduling::latencyConverged(BlockState &BS) const { // upperbound of the latency safety margin that should be provided by // the epilogue BS.FixPoint.MaxLatencyExtent = MaxExtent; - return true; + return nullptr; } bool InterBlockScheduling::updateFixPoint(BlockState &BS) { @@ -316,11 +323,20 @@ bool InterBlockScheduling::updateFixPoint(BlockState &BS) { // Iterate on CurMBB return false; } - if (!latencyConverged(BS)) { - BS.FixPoint.LatencyMargin++; + + if (MachineInstr *MINeedsHigherCap = latencyConverged(BS)) { + auto Res = BS.FixPoint.PerMILatencyMargin.try_emplace(MINeedsHigherCap, 0); + // Increase the latency margin per instruction, unless we already iterated + // more than MaxExpensiveIterations without converging. 
+ if (BS.FixPoint.NumIters <= MaxExpensiveIterations) { + ++Res.first->second; + } else { + BS.FixPoint.LatencyMargin++; + } DEBUG_LOOPAWARE(dbgs() << " not converged: latency RM=" - << BS.FixPoint.ResourceMargin << " LM=>" - << BS.FixPoint.LatencyMargin << "\n"); + << BS.FixPoint.ResourceMargin + << " LM=" << BS.FixPoint.LatencyMargin + << " MIM=" << Res.first->second << "\n"); // Iterate on CurMBB return false; } @@ -341,13 +357,18 @@ bool InterBlockScheduling::successorsAreScheduled( }); } -std::optional -InterBlockScheduling::getLatencyCap(MachineBasicBlock *BB) const { - auto &BS = getBlockState(BB); +std::optional InterBlockScheduling::getLatencyCap(MachineInstr &MI) const { + auto &BS = getBlockState(MI.getParent()); if (BS.Kind != BlockType::Loop) { return {}; } - return BS.FixPoint.LatencyMargin; + if (BS.FixPoint.LatencyMargin) + return BS.FixPoint.LatencyMargin; + if (const auto *It = BS.FixPoint.PerMILatencyMargin.find(&MI); + It != BS.FixPoint.PerMILatencyMargin.end()) { + return It->second; + } + return 0; } std::optional diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h index 71e2a10bbfdb..8fb452a656b3 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h @@ -110,6 +110,7 @@ class FixedpointState { public: bool IsScheduled = false; int LatencyMargin = 0; + SmallMapVector PerMILatencyMargin; int ResourceMargin = 0; // Results from the convergence test int MaxLatencyExtent = 0; @@ -245,7 +246,10 @@ class InterBlockScheduling { /// The two components of the convergence test bool resourcesConverged(BlockState &BS) const; - bool latencyConverged(BlockState &BS) const; + + /// Return one instruction that needs a higher latency cap, or nullptr if all + /// latencies converged. 
+ MachineInstr *latencyConverged(BlockState &BS) const; /// After finding the loops, determine the epilogue blocks void markEpilogueBlocks(); @@ -301,7 +305,7 @@ class InterBlockScheduling { /// Return the maximum interblock latency we need to account for /// for the given successor. This represents the latency margin we assume for /// an unscheduled successor. - std::optional getLatencyCap(MachineBasicBlock *BB) const; + std::optional getLatencyCap(MachineInstr &MI) const; /// Return the maximum number of cycles to block for the given successor. /// This represents the resource usage we assume for an unscheduled successor. diff --git a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp index 6d4e0dd611d2..433c8a4780fa 100644 --- a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp +++ b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp @@ -173,7 +173,7 @@ unsigned MaxLatencyFinder::operator()(MachineInstr &MI) { // scheduling a loop. const AIE::InterBlockScheduling &IB = Scheduler->getInterBlock(); if (!InterBlock) { - if (auto Cap = IB.getLatencyCap(CurBB)) { + if (auto Cap = IB.getLatencyCap(MI)) { LLVM_DEBUG(dbgs() << "Capped at " << *Cap << "\n"); Latency = std::min(Latency, *Cap); } diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll index 7b514c67d750..31f18e67cb2d 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll @@ -300,7 +300,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: .LBB0_1: // %outer.loop.header ; DCL-NEXT: // =>This Loop Header: Depth=1 ; DCL-NEXT: // Child Loop BB0_2 Depth 2 -; DCL-NEXT: vlda wl6, [p1], #32; nopb ; nopx +; DCL-NEXT: vlda wl6, [p1], #32; nopxm ; DCL-NEXT: vlda wl5, [p0], m6; mov r0, p0 ; DCL-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32] ; DCL-NEXT: vlda wh6, [p1], #32 @@ -310,7 +310,6 @@ define dso_local 
void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: vlda wl7, [p0], m6 ; DCL-NEXT: vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m1, p5 ; DCL-NEXT: vlda.ups.s32.s16 bml1, s0, [p2], m1 -; DCL-NEXT: vlda wh8, [p1], #32 ; DCL-NEXT: vlda.3d wh7, [p0], d0 ; DCL-NEXT: vlda.ups.s32.s16 bmh2, s0, [p2, #32] ; DCL-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m5 @@ -323,16 +322,16 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32] ; DCL-NEXT: vlda.ups.s32.s16 bml6, s0, [p2], m5 ; DCL-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32] -; DCL-NEXT: vlda wl10, [p1], #32 +; DCL-NEXT: vlda wh8, [p1], #32 ; DCL-NEXT: vlda wl3, [p0], m6; mov r1, p0 ; DCL-NEXT: vlda.ups.s32.s16 bml7, s0, [p2, #0]; and r0, r0, r9 ; DCL-NEXT: vlda wh3, [p0], m6; add r0, r0, #33 ; DCL-NEXT: vlda wl5, [p0], m6; vshift.align x4, x4, s1, x5, r0 ; DCL-NEXT: vlda.3d wh5, [p0], d0; and r10, r1, r9; vshift.align x2, x2, s1, x7, r0 -; DCL-NEXT: vlda wh10, [p1], #32; add r0, r10, #33; mov r10, p0 -; DCL-NEXT: vlda wl1, [p1], #32; add r1, r5, #-1; vshuffle x7, x4, x2, r2 -; DCL-NEXT: vlda wh1, [p1], #32; add r1, r1, #-1; vshuffle x9, x7, x0, r8 -; DCL-NEXT: and r10, r10, r9 +; DCL-NEXT: vlda wl10, [p1], #32; add r0, r10, #33; mov r10, p0 +; DCL-NEXT: vlda wh10, [p1], #32; add r1, r5, #-1; vshuffle x7, x4, x2, r2 +; DCL-NEXT: vlda wl1, [p1], #32; add r1, r1, #-1; vshuffle x9, x7, x0, r8 +; DCL-NEXT: vlda wh1, [p1], #32; and r10, r10, r9 ; DCL-NEXT: .p2align 4 ; DCL-NEXT: .LBB0_2: // %inner.loop ; DCL-NEXT: // Parent Loop BB0_1 Depth=1 @@ -340,14 +339,13 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x9, x4, x2, r3; vmac cm1, cm1, x9, x6, r4 ; DCL-NEXT: nopa ; nopb ; nopx ; vshift.align x4, x4, s1, x3, r0; vmac cm5, cm5, x9, x8, r4 ; DCL-NEXT: vlda wl3, [p0], m6; vshift.align x2, x2, s1, x5, r0 -; DCL-NEXT: vlda wh3, [p0], m6; vshuffle x11, x9, 
x0, r8 -; DCL-NEXT: vlda wl5, [p0], m6; add r1, r1, #-1; vshuffle x7, x4, x2, r2; vmac cm0, cm0, x7, x6, r4 -; DCL-NEXT: vlda wl10, [p1], #32; jnz r1, #.LBB0_2; vmac cm4, cm4, x7, x8, r4 -; DCL-NEXT: vlda.3d wh5, [p0], d0; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 // Delay Slot 5 -; DCL-NEXT: vlda wh10, [p1], #32; vmov x6, x10; vmac cm6, cm6, x9, x8, r4 // Delay Slot 4 -; DCL-NEXT: vlda wl1, [p1], #32; vmov x8, x1; vmac cm3, cm3, x11, x6, r4 // Delay Slot 3 -; DCL-NEXT: vlda wh1, [p1], #32; add r0, r10, #33; mov r10, p0; vmac cm7, cm7, x11, x8, r4 // Delay Slot 2 -; DCL-NEXT: and r10, r10, r9 // Delay Slot 1 +; DCL-NEXT: vlda wh3, [p0], m6; add r1, r1, #-1; vshuffle x11, x9, x0, r8 +; DCL-NEXT: vlda wl5, [p0], m6; jnz r1, #.LBB0_2; vmac cm0, cm0, x7, x6, r4 +; DCL-NEXT: vlda.3d wh5, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 // Delay Slot 5 +; DCL-NEXT: vlda wl10, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 // Delay Slot 4 +; DCL-NEXT: vlda wh10, [p1], #32; vmov x6, x10; vmac cm6, cm6, x9, x8, r4 // Delay Slot 3 +; DCL-NEXT: vlda wl1, [p1], #32; add r0, r10, #33; mov r10, p0; vmac cm3, cm3, x11, x6, r4 // Delay Slot 2 +; DCL-NEXT: vlda wh1, [p1], #32; and r10, r10, r9; vmov x8, x1; vmac cm7, cm7, x11, x8, r4 // Delay Slot 1 ; DCL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1 ; DCL-NEXT: nopa ; nopb ; nopx ; vmov x11, x0 ; DCL-NEXT: vshuffle x0, x4, x2, r3 @@ -481,7 +479,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: .LBB0_1: // %outer.loop.header ; ZOL-NEXT: // =>This Loop Header: Depth=1 ; ZOL-NEXT: // Child Loop BB0_2 Depth 2 -; ZOL-NEXT: vlda wl6, [p1], #32; nopb ; nopx +; ZOL-NEXT: vlda wl6, [p1], #32; nopb ; nopxm ; ZOL-NEXT: vlda wl3, [p0], m6; mov r0, p0 ; ZOL-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32] ; ZOL-NEXT: vlda wh6, [p1], #32 @@ -495,7 +493,6 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: vlda.3d wh5, 
[p0], d0 ; ZOL-NEXT: vlda.ups.s32.s16 bmh2, s0, [p2, #32] ; ZOL-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m5 -; ZOL-NEXT: vlda wl10, [p1], #32 ; ZOL-NEXT: vlda.ups.s32.s16 bmh3, s0, [p2, #32]; mov m2, r14 ; ZOL-NEXT: vlda.ups.s32.s16 bml3, s0, [p2], m2 ; ZOL-NEXT: vlda.ups.s32.s16 bmh4, s0, [p2, #32] @@ -504,32 +501,31 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m1 ; ZOL-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32]; add.nc r1, r5, #-2 ; ZOL-NEXT: vlda.ups.s32.s16 bml6, s0, [p2], m5; movxm ls, #.LBB0_2 -; ZOL-NEXT: vlda wh10, [p1], #32; movxm le, #.L_LEnd0 +; ZOL-NEXT: vlda wl10, [p1], #32; movxm le, #.L_LEnd0 ; ZOL-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32]; mov lc, r1 ; ZOL-NEXT: nopb ; vlda wl3, [p0], m6; nops ; nopx ; mov r1, p0; nopv ; ZOL-NEXT: nopb ; vlda wh3, [p0], m6; nops ; and r0, r0, r9; nopm ; nopv ; ZOL-NEXT: nopb ; vlda wl5, [p0], m6; nops ; add r0, r0, #33; nopm ; nopv ; ZOL-NEXT: nopb ; vlda.3d wh5, [p0], d0; nops ; nopx ; vshift.align x4, x4, s1, x3, r0; nopv ; ZOL-NEXT: nopb ; vlda.ups.s32.s16 bml7, s0, [p2, #0]; nops ; nopx ; vshift.align x2, x2, s1, x5, r0; nopv -; ZOL-NEXT: nopb ; vlda wl1, [p1], #32; nops ; and r1, r1, r9; vshuffle x7, x4, x2, r2; nopv -; ZOL-NEXT: nopb ; vlda wh1, [p1], #32; nops ; add r0, r1, #33; mov r1, p0; nopv -; ZOL-NEXT: nopa ; nopb ; nopx ; vshuffle x9, x7, x0, r8 +; ZOL-NEXT: nopb ; vlda wh10, [p1], #32; nops ; and r1, r1, r9; vshuffle x7, x4, x2, r2; nopv +; ZOL-NEXT: nopb ; vlda wl1, [p1], #32; nops ; add r0, r1, #33; mov r1, p0; nopv +; ZOL-NEXT: vlda wh1, [p1], #32; nopb ; nopx ; vshuffle x9, x7, x0, r8 ; ZOL-NEXT: and r1, r1, r9 ; ZOL-NEXT: .p2align 4 ; ZOL-NEXT: .LBB0_2: // %inner.loop ; ZOL-NEXT: // Parent Loop BB0_1 Depth=1 ; ZOL-NEXT: // => This Inner Loop Header: Depth=2 -; ZOL-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x9, x4, x2, r3; vmac cm1, cm1, x9, x6, r4 -; ZOL-NEXT: vlda wl3, [p0], m6; nopx ; vshift.align x4, x4, s1, x3, 
r0; vmac cm5, cm5, x9, x8, r4 +; ZOL-NEXT: nopa ; nopx ; vshuffle x9, x4, x2, r3; vmac cm1, cm1, x9, x6, r4 +; ZOL-NEXT: vlda wl3, [p0], m6; vshift.align x4, x4, s1, x3, r0; vmac cm5, cm5, x9, x8, r4 ; ZOL-NEXT: vlda wh3, [p0], m6; vshift.align x2, x2, s1, x5, r0 ; ZOL-NEXT: vlda wl5, [p0], m6; vshuffle x11, x9, x0, r8; vmac cm0, cm0, x7, x6, r4 -; ZOL-NEXT: vlda wl10, [p1], #32; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 -; ZOL-NEXT: vlda.3d wh5, [p0], d0; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 +; ZOL-NEXT: vlda.3d wh5, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 +; ZOL-NEXT: vlda wl10, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 ; ZOL-NEXT: vlda wh10, [p1], #32; vmov x6, x10; vmac cm6, cm6, x9, x8, r4 -; ZOL-NEXT: vlda wl1, [p1], #32; vmov x8, x1; vmac cm3, cm3, x11, x6, r4 -; ZOL-NEXT: vlda wh1, [p1], #32; add r0, r1, #33; mov r1, p0; vmac cm7, cm7, x11, x8, r4 +; ZOL-NEXT: vlda wl1, [p1], #32; add r0, r1, #33; mov r1, p0; vmac cm3, cm3, x11, x6, r4 ; ZOL-NEXT: .L_LEnd0: -; ZOL-NEXT: nopb ; nopa ; nops ; and r1, r1, r9; nopm ; nopv +; ZOL-NEXT: nopb ; vlda wh1, [p1], #32; nops ; and r1, r1, r9; vmov x8, x1; vmac cm7, cm7, x11, x8, r4 ; ZOL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1 ; ZOL-NEXT: nopa ; nopb ; nopx ; vmov x11, x0 ; ZOL-NEXT: vshuffle x0, x4, x2, r3 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir index 74765446d4f5..9211647b6e94 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir @@ -15,7 +15,6 @@ # This means that in the fixpoint loop for loop-aware-scheduling, one needs to # increase the safety margin for one instruction at a time: The VLDs need to be # pushed up, not the VST. -# FIXME: Actually do this. 
--- name: add2d alignment: 16 @@ -48,23 +47,26 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $cm0, $cm4, $dc0, $dc4, $dj0, $dj4, $dn0, $dn4, $m0, $m1, $p1, $p2, $p3, $r0, $r1, $s0, $s1, $d0_3d:0x000000000001C870 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $cm8 = VADD killed renamable $cm4, killed renamable $cm0, renamable $r0 - ; CHECK-NEXT: renamable $cm0, renamable $p1 = VLDA_UPS_S32_D8_ag_pstm_nrm renamable $s1, killed renamable $p1, renamable $m1, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) - ; CHECK-NEXT: BUNDLE implicit-def $cm4, implicit-def $bml4, implicit-def $amll4, implicit-def $amlh4, implicit-def $bmh4, implicit-def $amhl4, implicit-def $amhh4, implicit-def $p2, implicit-def $dc0, implicit-def $dc4, implicit-def $srups_of, implicit-def $r1, implicit-def dead $srcarry, implicit $s1, implicit killed $p2, implicit $d0_3d, implicit $crsat, implicit $crupssign, implicit killed $r1 { - ; CHECK-NEXT: $cm4, $p2, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 $s1, killed $p2, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: BUNDLE implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit-def $p1, implicit-def $srups_of, implicit-def $r1, implicit-def dead $srcarry, implicit $s1, implicit killed $p1, implicit $m1, implicit $crsat, implicit $crupssign, implicit killed $r1 { + ; CHECK-NEXT: renamable $cm0, renamable $p1 = VLDA_UPS_S32_D8_ag_pstm_nrm renamable $s1, killed renamable $p1, renamable $m1, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) ; CHECK-NEXT: renamable $r1 = ADD_add_r_ri killed renamable $r1, -4, implicit-def dead $srcarry ; CHECK-NEXT: } - ; CHECK-NEXT: JNZ renamable $r1, %bb.1 + ; CHECK-NEXT: BUNDLE implicit-def $cm4, implicit-def 
$bml4, implicit-def $amll4, implicit-def $amlh4, implicit-def $bmh4, implicit-def $amhl4, implicit-def $amhh4, implicit-def $p2, implicit-def $dc0, implicit-def $dc4, implicit-def $srups_of, implicit-def $cm8, implicit-def $bml8, implicit-def $amll8, implicit-def $amlh8, implicit-def $bmh8, implicit-def $amhl8, implicit-def $amhh8, implicit $s1, implicit killed $p2, implicit $d0_3d, implicit $crsat, implicit $crupssign, implicit $r1, implicit $cm0, implicit $r0 { + ; CHECK-NEXT: $cm4, $p2, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 $s1, killed $p2, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: JNZ renamable $r1, %bb.1 + ; CHECK-NEXT: renamable $cm8 = VADD internal renamable $cm4, renamable $cm0, renamable $r0 + ; CHECK-NEXT: } ; CHECK-NEXT: NOP - ; CHECK-NEXT: renamable $p3 = VST_SRS_D8_S32_ag_pstm_nrm_imm killed renamable $p3, 32, killed renamable $cm8, renamable $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP + ; CHECK-NEXT: renamable $p3 = VST_SRS_D8_S32_ag_pstm_nrm_imm killed renamable $p3, 32, killed renamable $cm8, renamable $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) ; CHECK-NEXT: DelayedSchedBarrier ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: liveins: $cm0, $cm4, $p3, $r0, $s0 ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: NOP ; CHECK-NEXT: renamable $cm0 = VADD killed renamable $cm4, killed renamable $cm0, killed renamable $r0 ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/short-hwloop.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/short-hwloop.mir index 6f533ae34bfa..eb61b815734d 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/short-hwloop.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/short-hwloop.mir @@ -32,12 
+32,11 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $p0, $r0, $r1, $r2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $r0 = ADD_NC_GPR $r1, 1 - ; CHECK-NEXT: BUNDLE implicit-def $p0, implicit-def $r1, implicit $r0, implicit killed $p0, implicit killed $r1 { - ; CHECK-NEXT: $p0 = ST_dms_sts_pstm_nrm_imm $r0, killed $p0, 4 + ; CHECK-NEXT: BUNDLE implicit-def $r1, implicit-def $r0, implicit killed $r1 { ; CHECK-NEXT: $r1 = MUL_mul_r_rr killed $r1, $r1 + ; CHECK-NEXT: $r0 = ADD_NC_GPR internal $r1, 1 ; CHECK-NEXT: } - ; CHECK-NEXT: NOP + ; CHECK-NEXT: $p0 = ST_dms_sts_pstm_nrm_imm $r0, killed $p0, 4 ; CHECK-NEXT: PseudoLoopEnd %bb.2, %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: From ac84dcc9487cd44ea424a83c2eef31654a39e63e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Mon, 5 Aug 2024 16:00:18 +0100 Subject: [PATCH 03/31] [AIE2] NFC: Add baseline test with critical CM reg pressure In a follow-up commit, the premisched will re-order the instructions to reduce the pressure and avoid spills during RA. --- .../AIE/aie2/schedule/pre_ra/add2d_inner.mir | 125 ++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/add2d_inner.mir diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/add2d_inner.mir b/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/add2d_inner.mir new file mode 100644 index 000000000000..ddf2c75b1e5e --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/add2d_inner.mir @@ -0,0 +1,125 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates +# RUN: llc -march=aie2 -run-pass=machine-scheduler %s -o - | FileCheck %s + + +# This represents the innermost loop of Add2D after SW pipelining. +# We should see most of the VLDA.UPS instructions move down in the loop +# BB to reduce the reg pressure and avoid spills. They can later be moved back +# up by the post-RA scheduler. This should also make the 4 acc1024 COPY +# instructions coalesce-able. +--- +name: add2d_innermost +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: add2d_innermost + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0, $m0, $cm0, $cm1, $s0, $d1, $x0, $r0, $d0_3d + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:em = COPY $m0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:ep_as_32bit = COPY $p0 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:ep_as_32bit = COPY $p0 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = COPY $p0 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:eds = COPY $d0_3d + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:er = COPY $r0 + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:er = COPY $r0 + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:mss = COPY $s0 + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:mss = COPY $s0 + ; CHECK-NEXT: PseudoJ_jump_imm %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:acc1024 = COPY [[COPY4]] + ; CHECK-NEXT: [[VADD:%[0-9]+]]:acc1024 = VADD [[COPY5]], [[COPY17]], [[COPY14]] + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:acc1024 = COPY [[COPY3]] + ; CHECK-NEXT: 
[[VADD1:%[0-9]+]]:acc1024 = VADD [[COPY6]], [[COPY18]], [[COPY14]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:acc1024 = COPY [[COPY2]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: [[VADD2:%[0-9]+]]:acc1024 = VADD [[COPY7]], [[COPY19]], [[COPY14]] + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:acc1024 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: [[VADD3:%[0-9]+]]:acc1024 = VADD [[COPY8]], [[COPY20]], [[COPY14]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc1024, [[COPY10:%[0-9]+]]:ep_as_32bit, [[COPY12:%[0-9]+]].sub_dim_count:eds, [[COPY12:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 [[COPY15]], [[COPY10]], [[COPY12]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD1]], [[COPY16]], implicit-def $srsrs_of, 
implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:acc1024, [[COPY10:%[0-9]+]]:ep_as_32bit, [[COPY12:%[0-9]+]].sub_dim_count:eds, [[COPY12:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 [[COPY15]], [[COPY10]], [[COPY12]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD2]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:acc1024, [[COPY10:%[0-9]+]]:ep_as_32bit, [[COPY12:%[0-9]+]].sub_dim_count:eds, [[COPY12:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 [[COPY15]], [[COPY10]], [[COPY12]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD3]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:acc1024, [[COPY10:%[0-9]+]]:ep_as_32bit, [[COPY12:%[0-9]+]].sub_dim_count:eds, [[COPY12:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 [[COPY15]], [[COPY10]], [[COPY12]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:er = ADD_add_r_ri [[COPY13]], -4, implicit-def dead $srcarry + ; CHECK-NEXT: PseudoJNZ [[COPY13]], %bb.1 + ; CHECK-NEXT: PseudoJ_jump_imm %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: PseudoRET implicit $lr + + bb.0.entry: + liveins: $p0, $m0, $cm0, $cm1, $s0, $d1, $x0, $r0, $d0_3d + + %367:acc1024 = COPY $cm0 + %365:acc1024 = COPY $cm0 + %363:acc1024 = COPY $cm0 + %361:acc1024 = COPY $cm0 + %362:acc1024 = 
COPY $cm0 + %364:acc1024 = COPY $cm0 + %366:acc1024 = COPY $cm0 + %368:acc1024 = COPY $cm0 + %248:mss = COPY $s0 + %245:mss = COPY $s0 + %355:ep_as_32bit = COPY $p0 + %358:ep_as_32bit = COPY $p0 + %359:ep_as_32bit = COPY $p0 + %82:em = COPY $m0 + %272:eds = COPY $d0_3d + %360:er = COPY $r0 + %206:er = COPY $r0 + PseudoJ_jump_imm %bb.1 + + bb.1: + successors: %bb.2(0x04000000), %bb.1(0x7c000000) + + %327:acc1024 = COPY %367 + %325:acc1024 = COPY %365 + %323:acc1024 = COPY %363 + %321:acc1024 = COPY %361 + %361:acc1024, %355:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm %248, %355, %82, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + %360:er = ADD_add_r_ri %360, -4, implicit-def dead $srcarry + %281:acc1024 = VADD %362, %321, %206 + %363:acc1024, %355:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm %248, %355, %82, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + %284:acc1024 = VADD %364, %323, %206 + %365:acc1024, %355:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm %248, %355, %82, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + %287:acc1024 = VADD %366, %325, %206 + %367:acc1024, %355:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm %248, %355, %82, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + %362:acc1024, %358:ep_as_32bit, %272.sub_dim_count:eds, %272.sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 %248, %358, %272, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + %295:acc1024 = VADD %368, %327, %206 + %364:acc1024, %358:ep_as_32bit, %272.sub_dim_count:eds, %272.sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 %248, %358, %272, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + %366:acc1024, %358:ep_as_32bit, %272.sub_dim_count:eds, 
%272.sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 %248, %358, %272, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + %359:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm %359, 32, %281, %245, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + %359:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm %359, 32, %284, %245, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + %368:acc1024, %358:ep_as_32bit, %272.sub_dim_count:eds, %272.sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 %248, %358, %272, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + %359:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm %359, 32, %287, %245, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + %359:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm %359, 32, %295, %245, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + PseudoJNZ %360, %bb.1 + PseudoJ_jump_imm %bb.2 + + bb.2: + PseudoRET implicit $lr +... 
From 9a7a19871e71dad1decdbae177bedbbc443d1fbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Mon, 5 Aug 2024 17:07:42 +0100 Subject: [PATCH 04/31] [AIEX] Premisched: more conservative reg pressure reduction - Reserve a certain number of registers, not regunits - Be extra careful when the region max pressure exceeds limits --- llvm/lib/Target/AIE/AIEMachineScheduler.cpp | 54 +++-- llvm/lib/Target/AIE/AIEMachineScheduler.h | 4 + .../GlobalISel/legalize-dyn-stackalloc.ll | 31 ++- llvm/test/CodeGen/AIE/aie2/dyn-stackalloc.ll | 31 ++- .../AIE/aie2/end-to-end/Conv2D-red-swp.ll | 212 +++++++++--------- .../CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll | 36 +-- .../test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll | 8 +- .../AIE/aie2/ra/tie-subregs-flow-3d.mir | 60 ++--- .../AIE/aie2/schedule/pre_ra/add2d_inner.mir | 12 +- .../AIE/aie2/schedule/pre_ra/conv2d_inner.mir | 12 +- .../aie2/schedule/pre_ra/reduce_pressure.mir | 94 ++++---- 11 files changed, 293 insertions(+), 261 deletions(-) diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp index 38d4776e2c79..40f95f4ab998 100644 --- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp +++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp @@ -40,7 +40,7 @@ static cl::opt cl::desc("Track reg pressure more accurately and " "delay some instructions to avoid spills.")); static cl::opt NumCriticalFreeRegs( - "aie-premisched-near-critical-regs", cl::init(4), + "aie-premisched-near-critical-regs", cl::init(2), cl::desc("Number of free registers below which premisched should actively " "try to reduce the pressure.")); @@ -761,6 +761,33 @@ bool AIEPostRASchedStrategy::tryCandidate(SchedCandidate &Cand, return false; } +void AIEPreRASchedStrategy::initialize(ScheduleDAGMI *DAG) { + GenericScheduler::initialize(DAG); + + // Cache the threshold for each pressure set. 
+ const std::vector &RegionMaxPressure = + static_cast(DAG)->getRegPressure().MaxSetPressure; + PSetThresholds.clear(); + for (unsigned PSet = 0, EndPSet = RegionMaxPressure.size(); PSet < EndPSet; + ++PSet) { + unsigned MaxPressure = RegionMaxPressure[PSet]; + unsigned Limit = Context->RegClassInfo->getRegPressureSetLimit(PSet); + + // If the region has a maximum pressure that exceeds the target threshold, + // artificially reduce that threshold to force more conservative scheduling. + if (MaxPressure > Limit) { + unsigned ExtraPressure = MaxPressure - Limit; + if (Limit > ExtraPressure) + Limit -= ExtraPressure; + else + Limit = 0; + LLVM_DEBUG(dbgs() << TRI->getRegPressureSetName(PSet) + << " Decreased Threshold to " << Limit << "\n"); + } + PSetThresholds.push_back(Limit); + } +} + void AIEPreRASchedStrategy::enterRegion(MachineBasicBlock *BB, MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, @@ -874,8 +901,9 @@ bool AIEPreRASchedStrategy::isAvailableNode(SUnit &SU, SchedBoundary &Zone, } unsigned CurrPressure = BotRPT.getRegSetPressureAtPos()[WorstPC.getPSet()]; - if (CurrPressure + WorstPC.getUnitInc() < - TRI->getRegPressureSetLimit(*CurMBB->getParent(), WorstPC.getPSet())) { + if (CurrPressure + WorstPC.getUnitInc() + + (NumCriticalFreeRegs * WorstPC.getUnitInc()) < + PSetThresholds[WorstPC.getPSet()]) { // Worsening pressure, but still within limits, keep node as available return true; } @@ -960,10 +988,11 @@ bool AIEPreRASchedStrategy::tryCandidate(SchedCandidate &Cand, if (!PC.isValid()) return false; unsigned CurrPressure = BotRPT.getRegSetPressureAtPos()[PC.getPSet()]; - unsigned Threshold = - TRI->getRegPressureSetLimit(*CurMBB->getParent(), PC.getPSet()); - return Threshold <= NumCriticalFreeRegs || - CurrPressure >= Threshold - NumCriticalFreeRegs; + unsigned Threshold = PSetThresholds[PC.getPSet()]; + unsigned NumCriticalFreeUnits = + NumCriticalFreeRegs * std::abs(PC.getUnitInc()); + return Threshold <= NumCriticalFreeUnits || + 
CurrPressure >= Threshold - NumCriticalFreeUnits; }; PressureChange TryCandPC = getPressureChange(estimatePressureDiff(*TryCand.SU, BotRPT)); @@ -972,13 +1001,12 @@ bool AIEPreRASchedStrategy::tryCandidate(SchedCandidate &Cand, if ((IsNearCritical(TryCandPC) || IsNearCritical(CandPC)) && tryPressure(TryCandPC, CandPC, TryCand, Cand, RegMax, TRI, DAG->MF)) return TryCand.Reason != NoCand; - } - // Avoid increasing the max pressure of the entire region. - if (DAG->isTrackingPressure() && - tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand, - Cand, RegMax, TRI, DAG->MF)) - return TryCand.Reason != NoCand; + // Avoid increasing the max pressure of the entire region. + if (tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, + TryCand, Cand, RegMax, TRI, DAG->MF)) + return TryCand.Reason != NoCand; + } // Fall through to original instruction order. if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) || diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.h b/llvm/lib/Target/AIE/AIEMachineScheduler.h index da184abf12c7..b2f68a07129b 100644 --- a/llvm/lib/Target/AIE/AIEMachineScheduler.h +++ b/llvm/lib/Target/AIE/AIEMachineScheduler.h @@ -151,6 +151,8 @@ class AIEPreRASchedStrategy : public GenericScheduler { public: AIEPreRASchedStrategy(const MachineSchedContext *C) : GenericScheduler(C) {} + void initialize(ScheduleDAGMI *DAG) override; + void enterRegion(MachineBasicBlock *BB, MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, unsigned RegionInstrs); void leaveRegion(const SUnit &ExitSU); @@ -182,6 +184,8 @@ class AIEPreRASchedStrategy : public GenericScheduler { /// pressure-reducing SU to be scheduled first. /// SUDelayerMap[0] = 2 means that SU(0) is waiting on SU(2). 
std::vector SUDelayerMap; + + std::vector PSetThresholds; }; /// An extension to ScheduleDAGMI that provides callbacks on region entry/exit diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/legalize-dyn-stackalloc.ll b/llvm/test/CodeGen/AIE/aie2/GlobalISel/legalize-dyn-stackalloc.ll index f67ae5792250..2915bbb173ef 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/legalize-dyn-stackalloc.ll +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/legalize-dyn-stackalloc.ll @@ -150,34 +150,33 @@ define void @test_huge_stack(i32 noundef %n) #0 { ; CHECK-NEXT: mov p2, p7 ; CHECK-NEXT: mov p6, p7 ; CHECK-NEXT: paddb [p0], m0 -; CHECK-NEXT: paddb [p2], #-32 +; CHECK-NEXT: paddb [p6], #-32 +; CHECK-NEXT: movxm m0, #-40032 ; CHECK-NEXT: st r0, [p0, #0] ; CHECK-NEXT: lda r0, [p0, #0] -; CHECK-NEXT: mov r16, p2 +; CHECK-NEXT: paddb [p2], m0 ; CHECK-NEXT: mov p0, sp -; CHECK-NEXT: st p0, [p2, #0] -; CHECK-NEXT: mov p0, p1 -; CHECK-NEXT: mov p2, p7 -; CHECK-NEXT: paddb [p2], #-24 +; CHECK-NEXT: mov r16, p2 +; CHECK-NEXT: st p0, [p6, #0] +; CHECK-NEXT: mov p0, p7 +; CHECK-NEXT: paddb [p0], #-24 ; CHECK-NEXT: lshl r2, r0, r2 -; CHECK-NEXT: st r0, [p2], #4 +; CHECK-NEXT: st r0, [p0], #4 ; CHECK-NEXT: add r2, r2, #31 -; CHECK-NEXT: st r1, [p2, #0] -; CHECK-NEXT: and r2, r2, r3 +; CHECK-NEXT: st r1, [p0, #0] ; CHECK-NEXT: jl #extern_call -; CHECK-NEXT: mov m0, r2 // Delay Slot 5 -; CHECK-NEXT: paddb [p1], m0 // Delay Slot 4 -; CHECK-NEXT: movxm m0, #-40032 // Delay Slot 3 -; CHECK-NEXT: paddb [p6], m0 // Delay Slot 2 +; CHECK-NEXT: mov p0, p1 // Delay Slot 5 +; CHECK-NEXT: and r2, r2, r3 // Delay Slot 4 +; CHECK-NEXT: mov m0, r2 // Delay Slot 3 +; CHECK-NEXT: paddb [p1], m0 // Delay Slot 2 ; CHECK-NEXT: mov sp, p1 // Delay Slot 1 ; CHECK-NEXT: nopb ; nopa ; nops ; jl #extern_call; nopv ; CHECK-NEXT: nopa ; nopx // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 ; CHECK-NEXT: nop // Delay Slot 2 -; CHECK-NEXT: mov p0, p6 // Delay Slot 1 -; CHECK-NEXT: nopb ; 
nopa ; nops ; nopx ; mov p0, r16; nopv -; CHECK-NEXT: lda p0, [p0, #0]; nopx +; CHECK-NEXT: mov p0, r16 // Delay Slot 1 +; CHECK-NEXT: lda p0, [p6, #0]; nopx ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop diff --git a/llvm/test/CodeGen/AIE/aie2/dyn-stackalloc.ll b/llvm/test/CodeGen/AIE/aie2/dyn-stackalloc.ll index 09ea5c39ff53..337fecd1e4bd 100644 --- a/llvm/test/CodeGen/AIE/aie2/dyn-stackalloc.ll +++ b/llvm/test/CodeGen/AIE/aie2/dyn-stackalloc.ll @@ -150,34 +150,33 @@ define void @test_huge_stack(i32 noundef %n) #0 { ; CHECK-NEXT: mov p2, p7 ; CHECK-NEXT: mov p6, p7 ; CHECK-NEXT: paddb [p0], m0 -; CHECK-NEXT: paddb [p2], #-32 +; CHECK-NEXT: paddb [p6], #-32 +; CHECK-NEXT: movxm m0, #-40032 ; CHECK-NEXT: st r0, [p0, #0] ; CHECK-NEXT: lda r0, [p0, #0] -; CHECK-NEXT: mov r16, p2 +; CHECK-NEXT: paddb [p2], m0 ; CHECK-NEXT: mov p0, sp -; CHECK-NEXT: st p0, [p2, #0] -; CHECK-NEXT: mov p0, p1 -; CHECK-NEXT: mov p2, p7 -; CHECK-NEXT: paddb [p2], #-24 +; CHECK-NEXT: mov r16, p2 +; CHECK-NEXT: st p0, [p6, #0] +; CHECK-NEXT: mov p0, p7 +; CHECK-NEXT: paddb [p0], #-24 ; CHECK-NEXT: lshl r2, r0, r2 -; CHECK-NEXT: st r0, [p2], #4 +; CHECK-NEXT: st r0, [p0], #4 ; CHECK-NEXT: add r2, r2, #31 -; CHECK-NEXT: st r1, [p2, #0] -; CHECK-NEXT: and r2, r2, r3 +; CHECK-NEXT: st r1, [p0, #0] ; CHECK-NEXT: jl #extern_call -; CHECK-NEXT: mov m0, r2 // Delay Slot 5 -; CHECK-NEXT: paddb [p1], m0 // Delay Slot 4 -; CHECK-NEXT: movxm m0, #-40032 // Delay Slot 3 -; CHECK-NEXT: paddb [p6], m0 // Delay Slot 2 +; CHECK-NEXT: mov p0, p1 // Delay Slot 5 +; CHECK-NEXT: and r2, r2, r3 // Delay Slot 4 +; CHECK-NEXT: mov m0, r2 // Delay Slot 3 +; CHECK-NEXT: paddb [p1], m0 // Delay Slot 2 ; CHECK-NEXT: mov sp, p1 // Delay Slot 1 ; CHECK-NEXT: nopb ; nopa ; nops ; jl #extern_call; nopv ; CHECK-NEXT: nopa ; nopx // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 ; CHECK-NEXT: nop // Delay Slot 2 -; CHECK-NEXT: mov p0, p6 // Delay Slot 1 -; CHECK-NEXT: nopb ; nopa ; 
nops ; nopx ; mov p0, r16; nopv -; CHECK-NEXT: lda p0, [p0, #0]; nopx +; CHECK-NEXT: mov p0, r16 // Delay Slot 1 +; CHECK-NEXT: lda p0, [p6, #0]; nopx ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll index 31f18e67cb2d..fc78d7e13404 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll @@ -238,50 +238,50 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: mov p7, sp ; DCL-NEXT: paddb [p7], #-272; st p6, [sp, #-188] // 4-byte Folded Spill ; DCL-NEXT: lda r25, [p7, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-204; mov dc3, dj3 +; DCL-NEXT: paddb [p6], #-292; mov dc3, dj3 ; DCL-NEXT: lda m0, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-208; mov r28, dj3 +; DCL-NEXT: paddb [p6], #-296; mov r28, dj3 ; DCL-NEXT: lda dj0, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-212; mov dc7, dj3 -; DCL-NEXT: lda dj4, [p6, #0] +; DCL-NEXT: paddb [p6], #-300; mov dc7, dj3 +; DCL-NEXT: lda dn0, [p6, #0] ; DCL-NEXT: mov p6, sp -; DCL-NEXT: paddb [p6], #-216 -; DCL-NEXT: lda dn0, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-220 +; DCL-NEXT: paddb [p6], #-204 +; DCL-NEXT: lda m0, [p6, #0]; mov p6, sp +; DCL-NEXT: paddb [p6], #-208 +; DCL-NEXT: lda dj0, [p6, #0]; mov p6, sp +; DCL-NEXT: paddb [p6], #-212 +; DCL-NEXT: lda dj4, [p6, #0]; mov p6, sp +; DCL-NEXT: paddb [p6], #-216; mov p7, sp +; DCL-NEXT: lda dn0, [p6, #0]; st m0, [sp, #-96] // 4-byte Folded Spill +; DCL-NEXT: paddb [p7], #-200; mov p6, sp +; DCL-NEXT: lda m6, [p7, #0]; paddb [p6], #-220; st dj0, [sp, #-88] // 4-byte Folded Spill ; DCL-NEXT: lda dn4, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-228 ; DCL-NEXT: lda r11, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-232; mov p7, sp -; DCL-NEXT: lda dj1, [p6, #0] -; DCL-NEXT: paddb [p7], #-200; mov p6, sp -; DCL-NEXT: lda 
m6, [p7, #0]; paddb [p6], #-236 -; DCL-NEXT: lda r12, [p6, #0]; mov p6, sp +; DCL-NEXT: paddb [p6], #-232; st dn0, [sp, #-92] // 4-byte Folded Spill +; DCL-NEXT: lda dj1, [p6, #0]; mov p6, sp +; DCL-NEXT: paddb [p6], #-236 +; DCL-NEXT: lda r12, [p6, #0] +; DCL-NEXT: mov p6, sp ; DCL-NEXT: paddb [p6], #-240 ; DCL-NEXT: lda dn1, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-244 ; DCL-NEXT: lda dn5, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-292 -; DCL-NEXT: lda m2, [p6, #0] -; DCL-NEXT: mov p6, sp -; DCL-NEXT: paddb [p6], #-296 -; DCL-NEXT: lda dj2, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-300 -; DCL-NEXT: lda dn2, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-248 ; DCL-NEXT: lda r13, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-252; mov p7, sp -; DCL-NEXT: lda dj2, [p6, #0]; st m2, [sp, #-96] // 4-byte Folded Spill +; DCL-NEXT: lda dj2, [p6, #0] ; DCL-NEXT: mov p6, sp ; DCL-NEXT: lda m7, [sp, #-96]; paddb [p6], #-256 // 4-byte Folded Reload ; DCL-NEXT: lda dj6, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-260 ; DCL-NEXT: lda dn2, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-264; st dj2, [sp, #-88] // 4-byte Folded Spill +; DCL-NEXT: paddb [p6], #-264 ; DCL-NEXT: lda dn6, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-268 ; DCL-NEXT: lda r14, [p6, #0] ; DCL-NEXT: mov p6, sp -; DCL-NEXT: lda dj7, [sp, #-88]; paddb [p6], #-276; st dn2, [sp, #-92] // 4-byte Folded Reload4-byte Folded Spill +; DCL-NEXT: lda dj7, [sp, #-88]; paddb [p6], #-276 // 4-byte Folded Reload ; DCL-NEXT: lda dn3, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-280 ; DCL-NEXT: lda r26, [p6, #0]; mov p6, sp @@ -301,10 +301,10 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: // =>This Loop Header: Depth=1 ; DCL-NEXT: // Child Loop BB0_2 Depth 2 ; DCL-NEXT: vlda wl6, [p1], #32; nopxm -; DCL-NEXT: vlda wl5, [p0], m6; mov r0, p0 +; DCL-NEXT: vlda wl3, [p0], m6; mov r0, p0 ; DCL-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32] ; DCL-NEXT: vlda 
wh6, [p1], #32 -; DCL-NEXT: vlda wh5, [p0], m6; mov m5, p4 +; DCL-NEXT: vlda wh3, [p0], m6; mov m5, p4 ; DCL-NEXT: vlda.ups.s32.s16 bml0, s0, [p2], m5 ; DCL-NEXT: vlda wl8, [p1], #32 ; DCL-NEXT: vlda wl7, [p0], m6 @@ -323,29 +323,29 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: vlda.ups.s32.s16 bml6, s0, [p2], m5 ; DCL-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32] ; DCL-NEXT: vlda wh8, [p1], #32 -; DCL-NEXT: vlda wl3, [p0], m6; mov r1, p0 +; DCL-NEXT: vlda wl5, [p0], m6; mov r1, p0 ; DCL-NEXT: vlda.ups.s32.s16 bml7, s0, [p2, #0]; and r0, r0, r9 -; DCL-NEXT: vlda wh3, [p0], m6; add r0, r0, #33 -; DCL-NEXT: vlda wl5, [p0], m6; vshift.align x4, x4, s1, x5, r0 -; DCL-NEXT: vlda.3d wh5, [p0], d0; and r10, r1, r9; vshift.align x2, x2, s1, x7, r0 -; DCL-NEXT: vlda wl10, [p1], #32; add r0, r10, #33; mov r10, p0 -; DCL-NEXT: vlda wh10, [p1], #32; add r1, r5, #-1; vshuffle x7, x4, x2, r2 -; DCL-NEXT: vlda wl1, [p1], #32; add r1, r1, #-1; vshuffle x9, x7, x0, r8 -; DCL-NEXT: vlda wh1, [p1], #32; and r10, r10, r9 +; DCL-NEXT: vlda wh5, [p0], m6; add r0, r0, #33 +; DCL-NEXT: vlda wl3, [p0], m6; vshift.align x4, x4, s1, x3, r0 +; DCL-NEXT: vlda.3d wh3, [p0], d0; and r10, r1, r9; vshift.align x2, x2, s1, x7, r0 +; DCL-NEXT: vlda wl1, [p1], #32; add r0, r10, #33; mov r10, p0 +; DCL-NEXT: vlda wh1, [p1], #32; add r1, r5, #-1; vshuffle x7, x4, x2, r2 +; DCL-NEXT: vlda wl10, [p1], #32; add r1, r1, #-1; vshuffle x9, x7, x0, r8 +; DCL-NEXT: vlda wh10, [p1], #32; and r10, r10, r9 ; DCL-NEXT: .p2align 4 ; DCL-NEXT: .LBB0_2: // %inner.loop ; DCL-NEXT: // Parent Loop BB0_1 Depth=1 ; DCL-NEXT: // => This Inner Loop Header: Depth=2 ; DCL-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x9, x4, x2, r3; vmac cm1, cm1, x9, x6, r4 -; DCL-NEXT: nopa ; nopb ; nopx ; vshift.align x4, x4, s1, x3, r0; vmac cm5, cm5, x9, x8, r4 -; DCL-NEXT: vlda wl3, [p0], m6; vshift.align x2, x2, s1, x5, r0 -; DCL-NEXT: vlda wh3, [p0], m6; add r1, r1, #-1; vshuffle x11, x9, 
x0, r8 -; DCL-NEXT: vlda wl5, [p0], m6; jnz r1, #.LBB0_2; vmac cm0, cm0, x7, x6, r4 -; DCL-NEXT: vlda.3d wh5, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 // Delay Slot 5 -; DCL-NEXT: vlda wl10, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 // Delay Slot 4 -; DCL-NEXT: vlda wh10, [p1], #32; vmov x6, x10; vmac cm6, cm6, x9, x8, r4 // Delay Slot 3 -; DCL-NEXT: vlda wl1, [p1], #32; add r0, r10, #33; mov r10, p0; vmac cm3, cm3, x11, x6, r4 // Delay Slot 2 -; DCL-NEXT: vlda wh1, [p1], #32; and r10, r10, r9; vmov x8, x1; vmac cm7, cm7, x11, x8, r4 // Delay Slot 1 +; DCL-NEXT: nopa ; nopb ; nopx ; vshift.align x4, x4, s1, x5, r0; vmac cm5, cm5, x9, x8, r4 +; DCL-NEXT: vlda wl5, [p0], m6; vshift.align x2, x2, s1, x3, r0 +; DCL-NEXT: vlda wh5, [p0], m6; add r1, r1, #-1; vshuffle x11, x9, x0, r8 +; DCL-NEXT: vlda wl3, [p0], m6; jnz r1, #.LBB0_2; vmac cm0, cm0, x7, x6, r4 +; DCL-NEXT: vlda.3d wh3, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 // Delay Slot 5 +; DCL-NEXT: vlda wl1, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 // Delay Slot 4 +; DCL-NEXT: vlda wh1, [p1], #32; vmov x6, x1; vmac cm6, cm6, x9, x8, r4 // Delay Slot 3 +; DCL-NEXT: vlda wl10, [p1], #32; add r0, r10, #33; mov r10, p0; vmac cm3, cm3, x11, x6, r4 // Delay Slot 2 +; DCL-NEXT: vlda wh10, [p1], #32; and r10, r10, r9; vmov x8, x10; vmac cm7, cm7, x11, x8, r4 // Delay Slot 1 ; DCL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1 ; DCL-NEXT: nopa ; nopb ; nopx ; vmov x11, x0 ; DCL-NEXT: vshuffle x0, x4, x2, r3 @@ -359,25 +359,25 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: vlda wh0, [sp, #-32]; vmac cm2, cm2, x0, x6, r4 // 32-byte Folded Reload ; DCL-NEXT: lda dn7, [sp, #-92]; vmac cm5, cm6, x0, x8, r4 // 4-byte Folded Reload ; DCL-NEXT: vmac cm4, cm5, x9, x8, r4 -; DCL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x3, r0; vmac cm8, cm4, x7, x8, r4 // 4-byte Folded Reload -; DCL-NEXT: 
vshift.align x2, x2, s1, x5, r0; vmac cm3, cm3, x11, x6, r4 +; DCL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x5, r0; vmac cm8, cm4, x7, x8, r4 // 4-byte Folded Reload +; DCL-NEXT: vshift.align x2, x2, s1, x3, r0; vmac cm3, cm3, x11, x6, r4 ; DCL-NEXT: vshuffle x6, x4, x2, r2 ; DCL-NEXT: vmac cm6, cm7, x6, x8, r4 -; DCL-NEXT: vshuffle x8, x6, x0, r8; vmac cm7, cm0, x6, x10, r4 +; DCL-NEXT: vshuffle x8, x6, x0, r8; vmac cm7, cm0, x6, x1, r4 ; DCL-NEXT: st dn7, [sp, #-92] // 4-byte Folded Spill -; DCL-NEXT: vshuffle x3, x4, x2, r3; vmac cm0, cm1, x8, x10, r4 +; DCL-NEXT: vshuffle x3, x4, x2, r3; vmac cm0, cm1, x8, x1, r4 ; DCL-NEXT: st dj7, [sp, #-88] // 4-byte Folded Spill -; DCL-NEXT: vshuffle x5, x3, x0, r8; vmac cm1, cm2, x3, x10, r4 +; DCL-NEXT: vshuffle x5, x3, x0, r8; vmac cm1, cm2, x3, x1, r4 ; DCL-NEXT: vst.srs.s16.s32 bmh7, s2, [p3, #32]; mov s3, r6 -; DCL-NEXT: vst.srs.s16.s32 bml7, s3, [p3], #64; vmac cm2, cm3, x5, x10, r4 +; DCL-NEXT: vst.srs.s16.s32 bml7, s3, [p3], #64; vmac cm2, cm3, x5, x1, r4 ; DCL-NEXT: vst.srs.s16.s32 bmh0, s3, [p3, #32] -; DCL-NEXT: vst.srs.s16.s32 bml0, s3, [p3], m4; vmac cm3, cm8, x6, x1, r4 +; DCL-NEXT: vst.srs.s16.s32 bml0, s3, [p3], m4; vmac cm3, cm8, x6, x10, r4 ; DCL-NEXT: vst.srs.s16.s32 bmh1, s3, [p3, #32] -; DCL-NEXT: lda m7, [sp, #-96]; vst.srs.s16.s32 bml1, s3, [p3], #64; vmac cm8, cm4, x8, x1, r4 // 4-byte Folded Reload +; DCL-NEXT: lda m7, [sp, #-96]; vst.srs.s16.s32 bml1, s3, [p3], #64; vmac cm8, cm4, x8, x10, r4 // 4-byte Folded Reload ; DCL-NEXT: lda dc7, [sp, #-84]; vst.srs.s16.s32 bmh2, s3, [p3, #32]; mov m1, r27 // 4-byte Folded Reload -; DCL-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m1; vmac cm5, cm5, x3, x1, r4 +; DCL-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m1; vmac cm5, cm5, x3, x10, r4 ; DCL-NEXT: vst.srs.s16.s32 bmh3, s3, [p3, #32]; mov dj5, r12 -; DCL-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64; mov m2, r13; vmac cm4, cm6, x5, x1, r4 +; DCL-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64; mov m2, r13; vmac 
cm4, cm6, x5, x10, r4 ; DCL-NEXT: vst.srs.s16.s32 bmh8, s3, [p3, #32]; mov m3, r14 ; DCL-NEXT: vst.srs.s16.s32 bml8, s3, [p3], m4 ; DCL-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32] @@ -417,50 +417,50 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: mov p7, sp ; ZOL-NEXT: paddb [p7], #-272; st p6, [sp, #-188] // 4-byte Folded Spill ; ZOL-NEXT: lda r24, [p7, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-204; mov dc3, dj3 +; ZOL-NEXT: paddb [p6], #-292; mov dc3, dj3 ; ZOL-NEXT: lda m0, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-208; mov r27, dj3 +; ZOL-NEXT: paddb [p6], #-296; mov r27, dj3 ; ZOL-NEXT: lda dj0, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-212; mov dc7, dj3 -; ZOL-NEXT: lda dj4, [p6, #0] +; ZOL-NEXT: paddb [p6], #-300; mov dc7, dj3 +; ZOL-NEXT: lda dn0, [p6, #0] ; ZOL-NEXT: mov p6, sp -; ZOL-NEXT: paddb [p6], #-216 -; ZOL-NEXT: lda dn0, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-220 +; ZOL-NEXT: paddb [p6], #-204 +; ZOL-NEXT: lda m0, [p6, #0]; mov p6, sp +; ZOL-NEXT: paddb [p6], #-208 +; ZOL-NEXT: lda dj0, [p6, #0]; mov p6, sp +; ZOL-NEXT: paddb [p6], #-212 +; ZOL-NEXT: lda dj4, [p6, #0]; mov p6, sp +; ZOL-NEXT: paddb [p6], #-216; mov p7, sp +; ZOL-NEXT: lda dn0, [p6, #0]; st m0, [sp, #-96] // 4-byte Folded Spill +; ZOL-NEXT: paddb [p7], #-200; mov p6, sp +; ZOL-NEXT: lda m6, [p7, #0]; paddb [p6], #-220; st dj0, [sp, #-88] // 4-byte Folded Spill ; ZOL-NEXT: lda dn4, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-228 ; ZOL-NEXT: lda r10, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-232; mov p7, sp -; ZOL-NEXT: lda dj1, [p6, #0] -; ZOL-NEXT: paddb [p7], #-200; mov p6, sp -; ZOL-NEXT: lda m6, [p7, #0]; paddb [p6], #-236 -; ZOL-NEXT: lda r11, [p6, #0]; mov p6, sp +; ZOL-NEXT: paddb [p6], #-232; st dn0, [sp, #-92] // 4-byte Folded Spill +; ZOL-NEXT: lda dj1, [p6, #0]; mov p6, sp +; ZOL-NEXT: paddb [p6], #-236 +; ZOL-NEXT: lda r11, [p6, #0] +; ZOL-NEXT: mov p6, sp ; ZOL-NEXT: paddb [p6], #-240 ; 
ZOL-NEXT: lda dn1, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-244 ; ZOL-NEXT: lda dn5, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-292 -; ZOL-NEXT: lda m2, [p6, #0] -; ZOL-NEXT: mov p6, sp -; ZOL-NEXT: paddb [p6], #-296 -; ZOL-NEXT: lda dj2, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-300 -; ZOL-NEXT: lda dn2, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-248 ; ZOL-NEXT: lda r12, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-252; mov p7, sp -; ZOL-NEXT: lda dj2, [p6, #0]; st m2, [sp, #-96] // 4-byte Folded Spill +; ZOL-NEXT: lda dj2, [p6, #0] ; ZOL-NEXT: mov p6, sp ; ZOL-NEXT: lda m7, [sp, #-96]; paddb [p6], #-256 // 4-byte Folded Reload ; ZOL-NEXT: lda dj6, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-260 ; ZOL-NEXT: lda dn2, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-264; st dj2, [sp, #-88] // 4-byte Folded Spill +; ZOL-NEXT: paddb [p6], #-264 ; ZOL-NEXT: lda dn6, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-268 ; ZOL-NEXT: lda r13, [p6, #0] ; ZOL-NEXT: mov p6, sp -; ZOL-NEXT: lda dj7, [sp, #-88]; paddb [p6], #-276; st dn2, [sp, #-92] // 4-byte Folded Reload4-byte Folded Spill +; ZOL-NEXT: lda dj7, [sp, #-88]; paddb [p6], #-276 // 4-byte Folded Reload ; ZOL-NEXT: lda dn3, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-280 ; ZOL-NEXT: lda r25, [p6, #0]; mov p6, sp @@ -479,53 +479,53 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: .LBB0_1: // %outer.loop.header ; ZOL-NEXT: // =>This Loop Header: Depth=1 ; ZOL-NEXT: // Child Loop BB0_2 Depth 2 -; ZOL-NEXT: vlda wl6, [p1], #32; nopb ; nopxm +; ZOL-NEXT: vlda wl6, [p1], #32; nopx ; ZOL-NEXT: vlda wl3, [p0], m6; mov r0, p0 ; ZOL-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32] ; ZOL-NEXT: vlda wh6, [p1], #32 ; ZOL-NEXT: vlda wh3, [p0], m6; mov m5, p4 ; ZOL-NEXT: vlda.ups.s32.s16 bml0, s0, [p2], m5 ; ZOL-NEXT: vlda wl8, [p1], #32 -; ZOL-NEXT: vlda wl5, [p0], m6 +; ZOL-NEXT: vlda wl7, [p0], m6 ; ZOL-NEXT: vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m1, p5 ; 
ZOL-NEXT: vlda.ups.s32.s16 bml1, s0, [p2], m1 ; ZOL-NEXT: vlda wh8, [p1], #32 -; ZOL-NEXT: vlda.3d wh5, [p0], d0 +; ZOL-NEXT: vlda.3d wh7, [p0], d0 ; ZOL-NEXT: vlda.ups.s32.s16 bmh2, s0, [p2, #32] ; ZOL-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m5 +; ZOL-NEXT: vlda wl1, [p1], #32 ; ZOL-NEXT: vlda.ups.s32.s16 bmh3, s0, [p2, #32]; mov m2, r14 ; ZOL-NEXT: vlda.ups.s32.s16 bml3, s0, [p2], m2 ; ZOL-NEXT: vlda.ups.s32.s16 bmh4, s0, [p2, #32] ; ZOL-NEXT: vlda.ups.s32.s16 bml4, s0, [p2], m5 ; ZOL-NEXT: vlda.ups.s32.s16 bmh5, s0, [p2, #32] -; ZOL-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m1 +; ZOL-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m1; movxm ls, #.LBB0_2 ; ZOL-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32]; add.nc r1, r5, #-2 -; ZOL-NEXT: vlda.ups.s32.s16 bml6, s0, [p2], m5; movxm ls, #.LBB0_2 -; ZOL-NEXT: vlda wl10, [p1], #32; movxm le, #.L_LEnd0 -; ZOL-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32]; mov lc, r1 -; ZOL-NEXT: nopb ; vlda wl3, [p0], m6; nops ; nopx ; mov r1, p0; nopv -; ZOL-NEXT: nopb ; vlda wh3, [p0], m6; nops ; and r0, r0, r9; nopm ; nopv -; ZOL-NEXT: nopb ; vlda wl5, [p0], m6; nops ; add r0, r0, #33; nopm ; nopv -; ZOL-NEXT: nopb ; vlda.3d wh5, [p0], d0; nops ; nopx ; vshift.align x4, x4, s1, x3, r0; nopv -; ZOL-NEXT: nopb ; vlda.ups.s32.s16 bml7, s0, [p2, #0]; nops ; nopx ; vshift.align x2, x2, s1, x5, r0; nopv -; ZOL-NEXT: nopb ; vlda wh10, [p1], #32; nops ; and r1, r1, r9; vshuffle x7, x4, x2, r2; nopv -; ZOL-NEXT: nopb ; vlda wl1, [p1], #32; nops ; add r0, r1, #33; mov r1, p0; nopv -; ZOL-NEXT: vlda wh1, [p1], #32; nopb ; nopx ; vshuffle x9, x7, x0, r8 +; ZOL-NEXT: vlda.ups.s32.s16 bml6, s0, [p2], m5; mov lc, r1 +; ZOL-NEXT: vlda wl5, [p0], m6; mov r1, p0 +; ZOL-NEXT: vlda wh5, [p0], m6; movxm le, #.L_LEnd0 +; ZOL-NEXT: nopb ; vlda.ups.s32.s16 bmh7, s0, [p2, #32]; nops ; and r0, r0, r9; nopm ; nopv +; ZOL-NEXT: nopb ; vlda wl3, [p0], m6; nops ; add r0, r0, #33; nopm ; nopv +; ZOL-NEXT: nopb ; vlda.3d wh3, [p0], d0; nops ; nopx ; vshift.align x4, x4, s1, x3, r0; 
nopv +; ZOL-NEXT: nopb ; vlda.ups.s32.s16 bml7, s0, [p2, #0]; nops ; and r1, r1, r9; vshift.align x2, x2, s1, x7, r0; nopv +; ZOL-NEXT: nopb ; vlda wh1, [p1], #32; nops ; add r0, r1, #33; mov r1, p0; nopv +; ZOL-NEXT: nopb ; vlda wl10, [p1], #32; nops ; nopx ; vshuffle x7, x4, x2, r2; nopv +; ZOL-NEXT: vlda wh10, [p1], #32; nopb ; nopx ; vshuffle x9, x7, x0, r8 ; ZOL-NEXT: and r1, r1, r9 ; ZOL-NEXT: .p2align 4 ; ZOL-NEXT: .LBB0_2: // %inner.loop ; ZOL-NEXT: // Parent Loop BB0_1 Depth=1 ; ZOL-NEXT: // => This Inner Loop Header: Depth=2 ; ZOL-NEXT: nopa ; nopx ; vshuffle x9, x4, x2, r3; vmac cm1, cm1, x9, x6, r4 -; ZOL-NEXT: vlda wl3, [p0], m6; vshift.align x4, x4, s1, x3, r0; vmac cm5, cm5, x9, x8, r4 -; ZOL-NEXT: vlda wh3, [p0], m6; vshift.align x2, x2, s1, x5, r0 -; ZOL-NEXT: vlda wl5, [p0], m6; vshuffle x11, x9, x0, r8; vmac cm0, cm0, x7, x6, r4 -; ZOL-NEXT: vlda.3d wh5, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 -; ZOL-NEXT: vlda wl10, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 -; ZOL-NEXT: vlda wh10, [p1], #32; vmov x6, x10; vmac cm6, cm6, x9, x8, r4 -; ZOL-NEXT: vlda wl1, [p1], #32; add r0, r1, #33; mov r1, p0; vmac cm3, cm3, x11, x6, r4 +; ZOL-NEXT: vlda wl5, [p0], m6; vshift.align x4, x4, s1, x5, r0; vmac cm5, cm5, x9, x8, r4 +; ZOL-NEXT: vlda wh5, [p0], m6; vshift.align x2, x2, s1, x3, r0 +; ZOL-NEXT: vlda wl3, [p0], m6; vshuffle x11, x9, x0, r8; vmac cm0, cm0, x7, x6, r4 +; ZOL-NEXT: vlda.3d wh3, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 +; ZOL-NEXT: vlda wl1, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 +; ZOL-NEXT: vlda wh1, [p1], #32; vmov x6, x1; vmac cm6, cm6, x9, x8, r4 +; ZOL-NEXT: vlda wl10, [p1], #32; add r0, r1, #33; mov r1, p0; vmac cm3, cm3, x11, x6, r4 ; ZOL-NEXT: .L_LEnd0: -; ZOL-NEXT: nopb ; vlda wh1, [p1], #32; nops ; and r1, r1, r9; vmov x8, x1; vmac cm7, cm7, x11, x8, r4 +; ZOL-NEXT: nopb ; vlda wh10, [p1], #32; nops ; and r1, r1, r9; vmov x8, x10; vmac cm7, cm7, 
x11, x8, r4 ; ZOL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1 ; ZOL-NEXT: nopa ; nopb ; nopx ; vmov x11, x0 ; ZOL-NEXT: vshuffle x0, x4, x2, r3 @@ -539,25 +539,25 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: vlda wh0, [sp, #-32]; vmac cm2, cm2, x0, x6, r4 // 32-byte Folded Reload ; ZOL-NEXT: lda dn7, [sp, #-92]; vmac cm5, cm6, x0, x8, r4 // 4-byte Folded Reload ; ZOL-NEXT: vmac cm4, cm5, x9, x8, r4 -; ZOL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x3, r0; vmac cm8, cm4, x7, x8, r4 // 4-byte Folded Reload -; ZOL-NEXT: vshift.align x2, x2, s1, x5, r0; vmac cm3, cm3, x11, x6, r4 +; ZOL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x5, r0; vmac cm8, cm4, x7, x8, r4 // 4-byte Folded Reload +; ZOL-NEXT: vshift.align x2, x2, s1, x3, r0; vmac cm3, cm3, x11, x6, r4 ; ZOL-NEXT: vshuffle x6, x4, x2, r2 ; ZOL-NEXT: vmac cm6, cm7, x6, x8, r4 -; ZOL-NEXT: vshuffle x8, x6, x0, r8; vmac cm7, cm0, x6, x10, r4 +; ZOL-NEXT: vshuffle x8, x6, x0, r8; vmac cm7, cm0, x6, x1, r4 ; ZOL-NEXT: st dn7, [sp, #-92] // 4-byte Folded Spill -; ZOL-NEXT: vshuffle x3, x4, x2, r3; vmac cm0, cm1, x8, x10, r4 +; ZOL-NEXT: vshuffle x3, x4, x2, r3; vmac cm0, cm1, x8, x1, r4 ; ZOL-NEXT: st dj7, [sp, #-88] // 4-byte Folded Spill -; ZOL-NEXT: vshuffle x5, x3, x0, r8; vmac cm1, cm2, x3, x10, r4 +; ZOL-NEXT: vshuffle x5, x3, x0, r8; vmac cm1, cm2, x3, x1, r4 ; ZOL-NEXT: vst.srs.s16.s32 bmh7, s2, [p3, #32]; mov s3, r6 -; ZOL-NEXT: vst.srs.s16.s32 bml7, s3, [p3], #64; vmac cm2, cm3, x5, x10, r4 +; ZOL-NEXT: vst.srs.s16.s32 bml7, s3, [p3], #64; vmac cm2, cm3, x5, x1, r4 ; ZOL-NEXT: vst.srs.s16.s32 bmh0, s3, [p3, #32] -; ZOL-NEXT: vst.srs.s16.s32 bml0, s3, [p3], m4; vmac cm3, cm8, x6, x1, r4 +; ZOL-NEXT: vst.srs.s16.s32 bml0, s3, [p3], m4; vmac cm3, cm8, x6, x10, r4 ; ZOL-NEXT: vst.srs.s16.s32 bmh1, s3, [p3, #32] -; ZOL-NEXT: lda m7, [sp, #-96]; vst.srs.s16.s32 bml1, s3, [p3], #64; vmac cm8, cm4, x8, x1, r4 // 4-byte Folded Reload +; ZOL-NEXT: 
lda m7, [sp, #-96]; vst.srs.s16.s32 bml1, s3, [p3], #64; vmac cm8, cm4, x8, x10, r4 // 4-byte Folded Reload ; ZOL-NEXT: lda dc7, [sp, #-84]; vst.srs.s16.s32 bmh2, s3, [p3, #32]; mov m1, r26 // 4-byte Folded Reload -; ZOL-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m1; vmac cm5, cm5, x3, x1, r4 +; ZOL-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m1; vmac cm5, cm5, x3, x10, r4 ; ZOL-NEXT: vst.srs.s16.s32 bmh3, s3, [p3, #32]; mov dj5, r11 -; ZOL-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64; mov m2, r12; vmac cm4, cm6, x5, x1, r4 +; ZOL-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64; mov m2, r12; vmac cm4, cm6, x5, x10, r4 ; ZOL-NEXT: vst.srs.s16.s32 bmh8, s3, [p3, #32]; mov m3, r13 ; ZOL-NEXT: vst.srs.s16.s32 bml8, s3, [p3], m4 ; ZOL-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32] diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll index eac8073544b8..ece27639a0ac 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll @@ -45,35 +45,35 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ASM-NEXT: mov dc0, dj3 ; ASM-NEXT: st p6, [sp, #-28] // 4-byte Folded Spill ; ASM-NEXT: mov p6, sp -; ASM-NEXT: paddb [p6], #-44; mov dc4, dj3 +; ASM-NEXT: paddb [p6], #-132; mov dc4, dj3 +; ASM-NEXT: lda m5, [p6, #0]; mov p6, sp +; ASM-NEXT: paddb [p6], #-136; mov dc1, dj3 +; ASM-NEXT: lda r28, [p6, #0]; mov r25, dj3 +; ASM-NEXT: mov p6, sp +; ASM-NEXT: paddb [p6], #-140; mov dc2, dj3 +; ASM-NEXT: lda r27, [p6, #0]; mov p6, sp +; ASM-NEXT: paddb [p6], #-44 ; ASM-NEXT: lda m0, [p6, #0]; mov p6, sp -; ASM-NEXT: paddb [p6], #-48; mov dc1, dj3 -; ASM-NEXT: lda dj0, [p6, #0]; mov r25, dj3 +; ASM-NEXT: paddb [p6], #-48 +; ASM-NEXT: lda dj0, [p6, #0]; mov p6, sp +; ASM-NEXT: paddb [p6], #-52; mov dc6, dj3 +; ASM-NEXT: lda dj4, [p6, #0] ; ASM-NEXT: mov p6, sp -; ASM-NEXT: paddb [p6], #-52; mov dc2, dj3 -; ASM-NEXT: lda dj4, [p6, #0]; mov p6, sp ; 
ASM-NEXT: paddb [p6], #-56 ; ASM-NEXT: lda dn0, [p6, #0]; mov p6, sp ; ASM-NEXT: paddb [p6], #-60 ; ASM-NEXT: lda dn4, [p6, #0]; mov p6, sp -; ASM-NEXT: paddb [p6], #-68; mov dc6, dj3 -; ASM-NEXT: lda r10, [p6, #0] +; ASM-NEXT: paddb [p6], #-68 +; ASM-NEXT: lda r10, [p6, #0]; mov p6, sp +; ASM-NEXT: paddb [p6], #-72; mov dc3, dj3 +; ASM-NEXT: lda dj1, [p6, #0] ; ASM-NEXT: mov p6, sp -; ASM-NEXT: paddb [p6], #-72 -; ASM-NEXT: lda dj1, [p6, #0]; mov p6, sp ; ASM-NEXT: paddb [p6], #-76 ; ASM-NEXT: lda r11, [p6, #0]; mov p6, sp ; ASM-NEXT: paddb [p6], #-80 ; ASM-NEXT: lda dn1, [p6, #0]; mov p6, sp -; ASM-NEXT: paddb [p6], #-84; mov dc3, dj3 -; ASM-NEXT: lda r12, [p6, #0] -; ASM-NEXT: mov p6, sp -; ASM-NEXT: paddb [p6], #-132 -; ASM-NEXT: lda m5, [p6, #0]; mov p6, sp -; ASM-NEXT: paddb [p6], #-136 -; ASM-NEXT: lda r28, [p6, #0]; mov p6, sp -; ASM-NEXT: paddb [p6], #-140 -; ASM-NEXT: lda r27, [p6, #0]; mov p6, sp +; ASM-NEXT: paddb [p6], #-84 +; ASM-NEXT: lda r12, [p6, #0]; mov p6, sp ; ASM-NEXT: paddb [p6], #-88; mov dc7, dj3 ; ASM-NEXT: lda r13, [p6, #0] ; ASM-NEXT: mov p6, sp diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll index 24b62e2d76d5..a292a15d449f 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll @@ -67,10 +67,10 @@ define void @mul2d(ptr noalias %in_ptr0, ptr noalias %in_ptr1, ptr noalias %out_ ; CHECK-NEXT: paddb [p3], #-20 ; CHECK-NEXT: lda dn4, [p3, #0]; mov p3, sp ; CHECK-NEXT: mova dc0, #0; paddb [p3], #-24; movx r2, #1 -; CHECK-NEXT: lda m0, [p3, #0]; movx r3, #0 -; CHECK-NEXT: movx r4, #-1; mov dc4, dc0 -; CHECK-NEXT: extend.u8 r5, r5 -; CHECK-NEXT: lshl r1, r1, r4; mov s0, r5 +; CHECK-NEXT: lda m0, [p3, #0]; extend.u8 r5, r5 +; CHECK-NEXT: movx r3, #0; mov s0, r5 +; CHECK-NEXT: movx r4, #-1 +; CHECK-NEXT: lshl r1, r1, r4; mov dc4, dc0 ; CHECK-NEXT: ne r2, r0, r2; vbcst.8 x0, r3 ; CHECK-NEXT: movx r0, #808; mov crSRSSign, r2 ; 
CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/AIE/aie2/ra/tie-subregs-flow-3d.mir b/llvm/test/CodeGen/AIE/aie2/ra/tie-subregs-flow-3d.mir index b71fca714ae3..53d67b09d727 100644 --- a/llvm/test/CodeGen/AIE/aie2/ra/tie-subregs-flow-3d.mir +++ b/llvm/test/CodeGen/AIE/aie2/ra/tie-subregs-flow-3d.mir @@ -47,11 +47,9 @@ body: | %7:edc = COPY $r7 %8:ep = COPY $p0 - ; ISel code for: %200(p0), %300(i20), %400(i20) = G_INTRINSIC(add_3d_byte) %8, %0, %1, %2, %3, %4, %5, %6, %7 %100:eds = REG_SEQUENCE %0, %subreg.sub_mod, %1, %subreg.sub_dim_size, %2, %subreg.sub_dim_stride, %3, %subreg.sub_dim_count, %4, %subreg.sub_hi_dim_then_sub_mod, %5, %subreg.sub_hi_dim_then_sub_dim_size, %6, %subreg.sub_hi_dim_then_sub_dim_stride, %7, %subreg.sub_hi_dim_then_sub_dim_count %200:ep, %300:edc, %400:edc = PADDA_3D %8, %100 - ; ISel code for: %201(p0), %301(i20), %401(i20) = G_INTRINSIC(add_3d_byte) %8, %0, %1, %2, %300, %4, %5, %6, %400 %101:eds = REG_SEQUENCE %0, %subreg.sub_mod, %1, %subreg.sub_dim_size, %2, %subreg.sub_dim_stride, %300, %subreg.sub_dim_count, %4, %subreg.sub_hi_dim_then_sub_mod, %5, %subreg.sub_hi_dim_then_sub_dim_size, %6, %subreg.sub_hi_dim_then_sub_dim_stride, %400, %subreg.sub_hi_dim_then_sub_dim_count %201:ep, %301:edc, %401:edc = PADDA_3D %200, %101 @@ -84,38 +82,42 @@ body: | ; CHECK-LABEL: name: test_4_padd_scarce ; CHECK: liveins: $m0, $p0, $p1, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r13, $r14, $r15, $d3_3d ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $m2 = MOV_mv_scl killed $r8 - ; CHECK-NEXT: dead renamable $m6 = KILL killed $r12 - ; CHECK-NEXT: $dn2 = MOV_mv_scl killed $r9 - ; CHECK-NEXT: $dj2 = MOV_mv_scl killed $r10 - ; CHECK-NEXT: $dc2 = MOV_mv_scl killed $r11 - ; CHECK-NEXT: $dn6 = MOV_mv_scl killed $r13 - ; CHECK-NEXT: $dj6 = MOV_mv_scl killed $r14 - ; CHECK-NEXT: $dc6 = MOV_mv_scl killed $r15 - ; CHECK-NEXT: $p2 = MOV_mv_scl $p1 - ; CHECK-NEXT: $p1, dead $dc2, dead $dc6 = PADDA_3D killed $p1, $d2_3d - ; CHECK-NEXT: $m1 = 
MOV_mv_scl $r0 - ; CHECK-NEXT: $dn1 = MOV_mv_scl $r1 - ; CHECK-NEXT: $dj1 = MOV_mv_scl $r2 - ; CHECK-NEXT: $dc1 = MOV_mv_scl $r3 - ; CHECK-NEXT: $m5 = MOV_mv_scl $r4 - ; CHECK-NEXT: $dn5 = MOV_mv_scl $r5 - ; CHECK-NEXT: $dj5 = MOV_mv_scl $r6 - ; CHECK-NEXT: $dc5 = MOV_mv_scl $r7 + ; CHECK-NEXT: frame-setup PADDB_sp_imm 32, implicit-def $sp, implicit $sp + ; CHECK-NEXT: $m2 = MOV_mv_scl killed $r0 + ; CHECK-NEXT: $m1 = MOV_mv_scl killed $r8 ; CHECK-NEXT: $dn2 = MOV_mv_scl killed $r1 ; CHECK-NEXT: $dj2 = MOV_mv_scl killed $r2 - ; CHECK-NEXT: $m2 = MOV_mv_scl killed $r0 + ; CHECK-NEXT: $dc2 = MOV_mv_scl killed $r3 + ; CHECK-NEXT: $m6 = MOV_mv_scl killed $r4 ; CHECK-NEXT: $dn6 = MOV_mv_scl killed $r5 ; CHECK-NEXT: $dj6 = MOV_mv_scl killed $r6 - ; CHECK-NEXT: $p2, dead $dc1, dead $dc5 = PADDA_3D killed $p2, $d1_3d - ; CHECK-NEXT: $dc2 = MOV_mv_scl killed $r3 - ; CHECK-NEXT: RET implicit $lr ; CHECK-NEXT: $dc6 = MOV_mv_scl killed $r7 - ; CHECK-NEXT: $p0, $dc2, $dc6 = PADDA_3D killed $p0, killed $d2_3d - ; CHECK-NEXT: dead renamable $m6 = KILL killed $r4 - ; CHECK-NEXT: $dc1 = MOV_mv_scl killed $dc2 - ; CHECK-NEXT: $dc5 = MOV_mv_scl killed $dc6 - ; CHECK-NEXT: $p0, dead $dc1, dead $dc5 = PADDA_3D killed $p0, killed $d1_3d + ; CHECK-NEXT: ST_dms_spill killed $m1, -32, implicit $sp :: (store (s32) into %stack.0) + ; CHECK-NEXT: $m1 = MOV_mv_scl $m2 + ; CHECK-NEXT: $dn1 = MOV_mv_scl $dn2 + ; CHECK-NEXT: $dj1 = MOV_mv_scl $dj2 + ; CHECK-NEXT: $dc1 = MOV_mv_scl $dc2 + ; CHECK-NEXT: $m5 = MOV_mv_scl $m6 + ; CHECK-NEXT: $dn5 = MOV_mv_scl $dn6 + ; CHECK-NEXT: $dj5 = MOV_mv_scl $dj6 + ; CHECK-NEXT: $dc5 = MOV_mv_scl $dc6 + ; CHECK-NEXT: $p0, $dc1, $dc5 = PADDA_3D killed $p0, $d1_3d + ; CHECK-NEXT: $m1 = LDA_dms_spill -32, implicit $sp :: (load (s32) from %stack.0) + ; CHECK-NEXT: $p2 = MOV_mv_scl $p1 + ; CHECK-NEXT: $p2, $dc2, $dc6 = PADDA_3D killed $p2, $d2_3d + ; CHECK-NEXT: $dn1 = MOV_mv_scl killed $r9 + ; CHECK-NEXT: $dj1 = MOV_mv_scl killed $r10 + ; CHECK-NEXT: $dn5 = 
MOV_mv_scl killed $r13 + ; CHECK-NEXT: frame-destroy PADDB_sp_imm -32, implicit-def $sp, implicit $sp + ; CHECK-NEXT: $dj5 = MOV_mv_scl killed $r14 + ; CHECK-NEXT: $dc2 = MOV_mv_scl killed $dc1 + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: $dc6 = MOV_mv_scl killed $dc5 + ; CHECK-NEXT: $dc1 = MOV_mv_scl killed $r11 + ; CHECK-NEXT: $dc5 = MOV_mv_scl killed $r15 + ; CHECK-NEXT: $p0, dead $dc2, dead $dc6 = PADDA_3D killed $p0, killed $d2_3d + ; CHECK-NEXT: $p1, dead $dc1, dead $dc5 = PADDA_3D killed $p1, killed $d1_3d + ; CHECK-NEXT: dead renamable $m5 = KILL killed $r12 ; CHECK-NEXT: DelayedSchedBarrier implicit killed renamable $p0, implicit killed renamable $p1, implicit killed renamable $p2, implicit killed $m0, implicit killed $d3_3d %0:em = COPY $r0 %1:edn = COPY $r1 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/add2d_inner.mir b/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/add2d_inner.mir index ddf2c75b1e5e..3e5609195bee 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/add2d_inner.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/add2d_inner.mir @@ -49,20 +49,20 @@ body: | ; CHECK-NEXT: [[COPY18:%[0-9]+]]:acc1024 = COPY [[COPY3]] ; CHECK-NEXT: [[VADD1:%[0-9]+]]:acc1024 = VADD [[COPY6]], [[COPY18]], [[COPY14]] ; CHECK-NEXT: [[COPY19:%[0-9]+]]:acc1024 = COPY [[COPY2]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) ; CHECK-NEXT: [[VADD2:%[0-9]+]]:acc1024 = VADD [[COPY7]], [[COPY19]], [[COPY14]] ; CHECK-NEXT: [[COPY20:%[0-9]+]]:acc1024 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) ; CHECK-NEXT: [[VADD3:%[0-9]+]]:acc1024 = VADD [[COPY8]], [[COPY20]], 
[[COPY14]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD1]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD2]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD3]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD]], 
[[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc1024, [[COPY10:%[0-9]+]]:ep_as_32bit, [[COPY12:%[0-9]+]].sub_dim_count:eds, [[COPY12:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 [[COPY15]], [[COPY10]], [[COPY12]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD1]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) ; CHECK-NEXT: [[COPY6:%[0-9]+]]:acc1024, [[COPY10:%[0-9]+]]:ep_as_32bit, [[COPY12:%[0-9]+]].sub_dim_count:eds, [[COPY12:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 [[COPY15]], [[COPY10]], [[COPY12]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD2]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) ; CHECK-NEXT: [[COPY7:%[0-9]+]]:acc1024, [[COPY10:%[0-9]+]]:ep_as_32bit, [[COPY12:%[0-9]+]].sub_dim_count:eds, [[COPY12:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 [[COPY15]], [[COPY10]], [[COPY12]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD3]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) ; CHECK-NEXT: [[COPY8:%[0-9]+]]:acc1024, [[COPY10:%[0-9]+]]:ep_as_32bit, [[COPY12:%[0-9]+]].sub_dim_count:eds, [[COPY12:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 [[COPY15]], 
[[COPY10]], [[COPY12]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) ; CHECK-NEXT: [[COPY13:%[0-9]+]]:er = ADD_add_r_ri [[COPY13]], -4, implicit-def dead $srcarry ; CHECK-NEXT: PseudoJNZ [[COPY13]], %bb.1 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/conv2d_inner.mir b/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/conv2d_inner.mir index 1eee41a450b1..f1e3015a6ddf 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/conv2d_inner.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/conv2d_inner.mir @@ -65,22 +65,22 @@ body: | ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY3]], [[COPY17]], [[COPY33]], [[COPY22]] ; CHECK-NEXT: [[VSHUFFLE:%[0-9]+]]:vec512 = VSHUFFLE [[COPY14]], [[COPY15]], [[COPY21]] ; CHECK-NEXT: [[VSHUFFLE1:%[0-9]+]]:vec512 = VSHUFFLE [[VSHUFFLE]], [[COPY9]], [[COPY21]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY1]], [[COPY16]], [[COPY32]], [[COPY22]] ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY4]], [[VSHUFFLE1]], [[COPY32]], [[COPY22]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY5]], [[VSHUFFLE1]], [[COPY33]], [[COPY22]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY1]], [[COPY16]], [[COPY32]], [[COPY22]] ; CHECK-NEXT: [[COPY6:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY6]], [[VSHUFFLE]], [[COPY32]], [[COPY22]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vec512 = COPY [[COPY29]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY5]], [[VSHUFFLE1]], [[COPY33]], [[COPY22]] - ; CHECK-NEXT: undef [[COPY29:%[0-9]+]].sub_256_lo:vec512, [[COPY25:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY25]], 32 + ; CHECK-NEXT: [[COPY34:%[0-9]+]]:er = COPY [[COPY19]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY14]], [[COPY31]], [[COPY18]], [[COPY34]] ; CHECK-NEXT: [[COPY7:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense 
[[COPY7]], [[COPY16]], [[COPY33]], [[COPY22]] + ; CHECK-NEXT: undef [[COPY29:%[0-9]+]].sub_256_lo:vec512, [[COPY25:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY25]], 32 ; CHECK-NEXT: [[COPY29:%[0-9]+]].sub_256_hi:vec512, [[COPY25:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY25]], 32 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY8]], [[VSHUFFLE]], [[COPY33]], [[COPY22]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vec512 = COPY [[COPY30]] ; CHECK-NEXT: undef [[COPY30:%[0-9]+]].sub_256_lo:vec512, [[COPY25:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY25]], 32 - ; CHECK-NEXT: [[COPY34:%[0-9]+]]:er = COPY [[COPY19]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY14]], [[COPY31]], [[COPY18]], [[COPY34]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY15]], [[COPY31]], [[COPY28]], [[COPY34]] ; CHECK-NEXT: undef [[COPY18:%[0-9]+]].sub_256_lo:vec512, [[COPY26:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm [[COPY26]], [[COPY]] ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub_256_hi:vec512, [[COPY26:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm [[COPY26]], [[COPY]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY15]], [[COPY31]], [[COPY28]], [[COPY34]] ; CHECK-NEXT: undef [[COPY28:%[0-9]+]].sub_256_lo:vec512, [[COPY26:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm [[COPY26]], [[COPY]] ; CHECK-NEXT: [[COPY28:%[0-9]+]].sub_256_hi:vec512, [[COPY26:%[0-9]+]]:ep_as_32bit, [[COPY27:%[0-9]+]].sub_dim_count:eds, [[COPY27:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_dmw_lda_w [[COPY26]], [[COPY27]] ; CHECK-NEXT: [[COPY30:%[0-9]+]].sub_256_hi:vec512, [[COPY25:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY25]], 32 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/reduce_pressure.mir b/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/reduce_pressure.mir index 4910343c9f68..bf953955efc8 100644 --- 
a/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/reduce_pressure.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/reduce_pressure.mir @@ -34,34 +34,34 @@ body: | ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vec512 = COPY $x0 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vec512 = COPY $x0 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:acc1024 = COPY $cm0 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vec512 = COPY $x0 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:er = COPY $r0 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = COPY $p0 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:mss = COPY $s0 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vec512 = COPY $x0 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:er = COPY $r0 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:ep_as_32bit = COPY $p0 - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:eds = COPY $d0_3d - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:acc1024 = COPY $cm0 - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:er = COPY $r0 - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:mss = COPY $s0 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:ep_as_32bit = COPY $p0 + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:eds = COPY $d0_3d + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:er = COPY $r0 ; CHECK-NEXT: PseudoJ_jump_imm %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY17:%[0-9]+]]:vec512 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY8]], [[COPY4]], [[COPY17]], [[COPY10]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY14]], [[COPY5]], [[COPY17]], [[COPY10]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec512 = COPY [[COPY9]] - ; CHECK-NEXT: undef [[COPY9:%[0-9]+]].sub_256_lo:vec512, [[COPY11:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY11]], 32 - ; CHECK-NEXT: [[COPY9:%[0-9]+]].sub_256_hi:vec512, [[COPY11:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY11]], 32 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY2]], [[COPY16]], [[COPY6]], [[COPY10]] - ; CHECK-NEXT: undef 
[[COPY6:%[0-9]+]].sub_256_lo:vec512, [[COPY12:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY12]], 32 - ; CHECK-NEXT: [[COPY6:%[0-9]+]].sub_256_hi:vec512, [[COPY12:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY12]], 32 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY3]], [[COPY16]], [[COPY7]], [[COPY10]] - ; CHECK-NEXT: undef [[COPY7:%[0-9]+]].sub_256_lo:vec512, [[COPY12:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY12]], 32 - ; CHECK-NEXT: [[COPY7:%[0-9]+]].sub_256_hi:vec512, [[COPY12:%[0-9]+]]:ep_as_32bit, [[COPY13:%[0-9]+]].sub_dim_count:eds, [[COPY13:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_dmw_lda_w [[COPY12]], [[COPY13]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vec512 = VSHUFFLE [[COPY2]], [[COPY3]], [[COPY10]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vec512 = VSHUFFLE [[COPY4]], [[COPY]], [[COPY10]] - ; CHECK-NEXT: PseudoJNZ [[COPY15]], %bb.1 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY8]], [[COPY4]], [[COPY17]], [[COPY11]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY15]], [[COPY5]], [[COPY17]], [[COPY11]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec512 = COPY [[COPY10]] + ; CHECK-NEXT: undef [[COPY10:%[0-9]+]].sub_256_lo:vec512, [[COPY12:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY12]], 32 + ; CHECK-NEXT: [[COPY10:%[0-9]+]].sub_256_hi:vec512, [[COPY12:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY12]], 32 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY2]], [[COPY9]], [[COPY6]], [[COPY11]] + ; CHECK-NEXT: undef [[COPY6:%[0-9]+]].sub_256_lo:vec512, [[COPY13:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY13]], 32 + ; CHECK-NEXT: [[COPY6:%[0-9]+]].sub_256_hi:vec512, [[COPY13:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY13]], 32 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY3]], [[COPY9]], [[COPY7]], [[COPY11]] + ; CHECK-NEXT: undef 
[[COPY7:%[0-9]+]].sub_256_lo:vec512, [[COPY13:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY13]], 32 + ; CHECK-NEXT: [[COPY7:%[0-9]+]].sub_256_hi:vec512, [[COPY13:%[0-9]+]]:ep_as_32bit, [[COPY14:%[0-9]+]].sub_dim_count:eds, [[COPY14:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_dmw_lda_w [[COPY13]], [[COPY14]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vec512 = VSHUFFLE [[COPY2]], [[COPY3]], [[COPY11]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vec512 = VSHUFFLE [[COPY4]], [[COPY]], [[COPY11]] + ; CHECK-NEXT: PseudoJNZ [[COPY16]], %bb.1 ; CHECK-NEXT: PseudoJ_jump_imm %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -126,17 +126,17 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec512 = COPY $x0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec1024 = COPY $y2 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec1024 = COPY $y2 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc1024 = COPY $cm0 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:er = COPY $r0 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:ep_as_32bit = COPY $p0 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:ep_as_32bit = COPY $p0 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:eds = COPY $d0_3d - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:acc1024 = COPY $cm0 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:er = COPY $r0 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vec512 = COPY $x0 - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vec512 = COPY $x0 - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vec1024 = COPY $y2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:er = COPY $r0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:eds = COPY $d0_3d + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:acc1024 = COPY $cm0 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:er = COPY $r0 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vec512 = COPY $x0 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vec512 = COPY $x0 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vec1024 = COPY $y2 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vec1024 = COPY $y2 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:ep_as_32bit = COPY $p0 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:ep_as_32bit = COPY $p0 ; CHECK-NEXT: 
[[COPY14:%[0-9]+]]:vec512 = COPY $x0 ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vec512 = COPY $x0 ; CHECK-NEXT: [[COPY16:%[0-9]+]]:vec512 = COPY $x0 @@ -148,21 +148,21 @@ body: | ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:vec512 = COPY [[COPY11]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY4]], [[COPY14]], [[COPY20]], [[COPY5]], implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY13]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY12]], [[COPY19]], [[COPY16]], [[COPY5]] - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY9]], [[COPY15]], [[COPY20]], [[COPY5]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vec512 = COPY [[COPY18]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY1]], [[COPY19]], [[COPY17]], [[COPY5]] - ; CHECK-NEXT: undef [[COPY18:%[0-9]+]].sub_256_lo:vec512, [[COPY6:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY6]], 32 - ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub_256_hi:vec512, [[COPY6:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY6]], 32 - ; CHECK-NEXT: undef [[COPY16:%[0-9]+]].sub_256_lo:vec512, [[COPY7:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY7]], 32 - ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub_256_hi:vec512, [[COPY7:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY7]], 32 - ; CHECK-NEXT: undef [[COPY17:%[0-9]+]].sub_256_lo:vec512, [[COPY7:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY7]], 32 - ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub_256_hi:vec512, [[COPY7:%[0-9]+]]:ep_as_32bit, [[COPY8:%[0-9]+]].sub_dim_count:eds, [[COPY8:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_dmw_lda_w [[COPY7]], [[COPY8]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vec512 = VSHUFFLE [[COPY12]], [[COPY1]], [[COPY5]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vec512 = VSHUFFLE [[COPY14]], [[COPY]], [[COPY5]] - ; CHECK-NEXT: PseudoJNZ [[COPY10]], %bb.1 + ; 
CHECK-NEXT: [[COPY20:%[0-9]+]]:vec512 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY3]], [[COPY14]], [[COPY20]], [[COPY4]], implicit [[COPY2]], implicit [[COPY10]], implicit [[COPY11]] + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY8]], [[COPY19]], [[COPY16]], [[COPY4]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:acc1024 = VMAC_vmac_cm_core_dense [[COPY6]], [[COPY15]], [[COPY20]], [[COPY4]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec512 = COPY [[COPY18]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vec512 = VSHIFT_ALIGN [[COPY9]], [[COPY19]], [[COPY17]], [[COPY4]] + ; CHECK-NEXT: undef [[COPY18:%[0-9]+]].sub_256_lo:vec512, [[COPY12:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY12]], 32 + ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub_256_hi:vec512, [[COPY12:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY12]], 32 + ; CHECK-NEXT: undef [[COPY16:%[0-9]+]].sub_256_lo:vec512, [[COPY13:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY13]], 32 + ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub_256_hi:vec512, [[COPY13:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY13]], 32 + ; CHECK-NEXT: undef [[COPY17:%[0-9]+]].sub_256_lo:vec512, [[COPY13:%[0-9]+]]:ep_as_32bit = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[COPY13]], 32 + ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub_256_hi:vec512, [[COPY13:%[0-9]+]]:ep_as_32bit, [[COPY5:%[0-9]+]].sub_dim_count:eds, [[COPY5:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_dmw_lda_w [[COPY13]], [[COPY5]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vec512 = VSHUFFLE [[COPY8]], [[COPY9]], [[COPY4]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vec512 = VSHUFFLE [[COPY14]], [[COPY]], [[COPY4]] + ; CHECK-NEXT: PseudoJNZ [[COPY7]], %bb.1 ; CHECK-NEXT: PseudoJ_jump_imm %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: From 42e32cc4d9d9abe12335fd62594e6278812b09a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Tue, 2 Jul 2024 13:40:55 +0100 Subject: [PATCH 05/31] [AIEX] NFC: Allow building 
AIEHazardRecognized with a fixed depth Useful for unit-testing where we might not have itineraries from which to derive the depth. --- llvm/lib/Target/AIE/AIEHazardRecognizer.cpp | 17 ++++++++++++----- llvm/lib/Target/AIE/AIEHazardRecognizer.h | 6 +++++- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp b/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp index 51f31f17ef48..bbcc9a2e3a83 100644 --- a/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp +++ b/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp @@ -161,13 +161,20 @@ static cl::opt int AIEHazardRecognizer::NumInstrsScheduled = 0; -AIEHazardRecognizer::AIEHazardRecognizer(const AIEBaseInstrInfo *TII, - const InstrItineraryData *II, - bool IsPreRA) +AIEHazardRecognizer::AIEHazardRecognizer( + const AIEBaseInstrInfo *TII, const InstrItineraryData *II, bool IsPreRA, + std::optional ScoreboardDepth) : TII(TII), ItinData(II) { - computeMaxLatency(); - int Depth = computeScoreboardDepth(); + int Depth = 0; + if (ScoreboardDepth.has_value()) { + MaxLatency = *ScoreboardDepth; + Depth = *ScoreboardDepth; + } else { + computeMaxLatency(); + Depth = computeScoreboardDepth(); + } + Scoreboard.reset(Depth); MaxLookAhead = Depth; if (CLIssueLimit > 0) diff --git a/llvm/lib/Target/AIE/AIEHazardRecognizer.h b/llvm/lib/Target/AIE/AIEHazardRecognizer.h index 4b1916c2dcee..2464531ef771 100644 --- a/llvm/lib/Target/AIE/AIEHazardRecognizer.h +++ b/llvm/lib/Target/AIE/AIEHazardRecognizer.h @@ -92,8 +92,12 @@ class AIEHazardRecognizer : public ScheduleHazardRecognizer { void computeMaxLatency(); public: + /// ScoreboardDepth can be used to specify a fixed depth without querying the + /// scheduling model. This is mostly used for testing, for other cases we + /// should trust the instruction itineraries. 
AIEHazardRecognizer(const AIEBaseInstrInfo *TII, const InstrItineraryData *II, - bool IsPreRA); + bool IsPreRA, + std::optional ScoreboardDepth = std::nullopt); AIEHazardRecognizer(const TargetSubtargetInfo &SubTarget, bool IsPreRA = false); From cd556d59557be6e5b30f5ca98c84261e57a49ca2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Tue, 2 Jul 2024 13:44:17 +0100 Subject: [PATCH 06/31] unittests/CodeGen: Extend ScheduleDAGMITestUtils to use SchedBoundary This gets the "fake" scheduler closer to the MachineScheduler. The scheduled instructions are "bumped" into the SchedBoundary, and that can trigger events like HazardRec->emitInstr(). --- .../CodeGen/ScheduleDAGMITestUtils.cpp | 28 ++++++-- .../CodeGen/ScheduleDAGMITestUtils.h | 19 +++++- .../Target/AIE/AIEScheduleDAGMITest.cpp | 64 +++++++++++++++++++ llvm/unittests/Target/AIE/CMakeLists.txt | 3 + 4 files changed, 107 insertions(+), 7 deletions(-) create mode 100644 llvm/unittests/Target/AIE/AIEScheduleDAGMITest.cpp diff --git a/llvm/unittests/CodeGen/ScheduleDAGMITestUtils.cpp b/llvm/unittests/CodeGen/ScheduleDAGMITestUtils.cpp index 53b6deaa3825..c9a3a544b8ce 100644 --- a/llvm/unittests/CodeGen/ScheduleDAGMITestUtils.cpp +++ b/llvm/unittests/CodeGen/ScheduleDAGMITestUtils.cpp @@ -26,8 +26,20 @@ MCInstrDesc MCIDs[] = { {TargetOpcode::DBG_VALUE, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, {TargetOpcode::GENERIC_OP_END + 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}}; +std::unique_ptr getSchedStrategy(MachineSchedContext *C, + bool IsPreRA) { + if (IsPreRA) + return std::make_unique(C); + return std::make_unique(C); +} + } // namespace +DummyScheduleDAGMI::DummyScheduleDAGMI(MachineSchedContext *C, bool IsPreRA) + : ScheduleDAGMI(C, getSchedStrategy(C, IsPreRA), + /*RemoveKillFlags=*/true), + IsPreRA(IsPreRA), SchedZone(SchedBoundary::BotQID, "Zone") {} + DummyScheduleDAGMI::~DummyScheduleDAGMI() { if (BB) { exitRegion(); @@ -44,6 +56,8 @@ void DummyScheduleDAGMI::prepareForBB(MachineBasicBlock *MBB) { 
SchedImpl->initialize(this); SmallVector TopRoots, BotRoots; initQueues(TopRoots, BotRoots); + + SchedZone.init(this, SchedImpl.get(), getSchedModel(), /*Rem=*/nullptr); } void DummyScheduleDAGMI::scheduleInstr(MachineInstr *MI, bool IsTop, @@ -52,16 +66,22 @@ void DummyScheduleDAGMI::scheduleInstr(MachineInstr *MI, bool IsTop, movePickedSU(*SU, IsTop, EmissionCycle); } +void DummyScheduleDAGMI::scheduleInstr(MachineInstr *MI, SchedBoundary &Zone) { + scheduleInstr(MI, Zone.isTop()); + unsigned &ReadyCycle = + Zone.isTop() ? getSUnit(MI)->TopReadyCycle : getSUnit(MI)->BotReadyCycle; + ReadyCycle = std::max(ReadyCycle, Zone.getCurrCycle()); + Zone.bumpNode(getSUnit(MI)); +} + ScheduleDAGMITest::ScheduleDAGMITest() : Mod("Module", Ctx) { MF = createMachineFunction(Ctx, Mod); MBB = MF->CreateMachineBasicBlock(); } -void ScheduleDAGMITest::initializeScheduler() { +void ScheduleDAGMITest::initializeScheduler(bool IsPreRA) { SchedCtx.MF = MF.get(); - Scheduler = std::make_unique( - &SchedCtx, std::make_unique(&SchedCtx), - /*RemoveKillFlags=*/true); + Scheduler = std::make_unique(&SchedCtx, IsPreRA); Scheduler->prepareForBB(MBB); } diff --git a/llvm/unittests/CodeGen/ScheduleDAGMITestUtils.h b/llvm/unittests/CodeGen/ScheduleDAGMITestUtils.h index 45236a0718a8..8700c6b59df1 100644 --- a/llvm/unittests/CodeGen/ScheduleDAGMITestUtils.h +++ b/llvm/unittests/CodeGen/ScheduleDAGMITestUtils.h @@ -18,16 +18,29 @@ namespace llvm { class DummyScheduleDAGMI : public ScheduleDAGMI { public: - using ScheduleDAGMI::ScheduleDAGMI; + DummyScheduleDAGMI(MachineSchedContext *C, bool IsPreRA); ~DummyScheduleDAGMI() override; /// Initialize enough stuff in a similar manner to ScheduleDAGMI::schedule() /// so one can do "manual" scheduling. void prepareForBB(MachineBasicBlock *MBB); - /// Move \p MI to the Top or Bot Zone + /// Move \p MI to the top or bottom of the scheduling region. 
void scheduleInstr(MachineInstr *MI, bool IsTop, std::optional EmissionCycle = std::nullopt); + + /// Move \p MI to Zone and update its ReadyCycle + /// This essentially mimics the body of the scheduling loop inside + /// ScheduleDAGMI::schedule(). + void scheduleInstr(MachineInstr *MI, SchedBoundary &Zone); + + SchedBoundary &getSchedZone() { return SchedZone; } + + bool hasVRegLiveness() const override { return IsPreRA; } + +protected: + bool IsPreRA; + SchedBoundary SchedZone; }; class ScheduleDAGMITest : public testing::Test { @@ -36,7 +49,7 @@ class ScheduleDAGMITest : public testing::Test { /// Initialize a DummyScheduleDAGMI so it is ready to schedule instructions /// in \p MBB - void initializeScheduler(); + virtual void initializeScheduler(bool IsPreRA = false); /// Create a dummy instruction for which MachineInstr::isDebugValue() is true /// It is pushed at the end of \p MBB diff --git a/llvm/unittests/Target/AIE/AIEScheduleDAGMITest.cpp b/llvm/unittests/Target/AIE/AIEScheduleDAGMITest.cpp new file mode 100644 index 000000000000..0e18373b5252 --- /dev/null +++ b/llvm/unittests/Target/AIE/AIEScheduleDAGMITest.cpp @@ -0,0 +1,64 @@ +//===- AIEScheduleDAGMITest.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates +// +//===----------------------------------------------------------------------===// + +#include "AIEHazardRecognizer.h" +#include "AIEMachineScheduler.h" +#include "ScheduleDAGMITestUtils.h" + +using namespace llvm; + +namespace { + +class DummyAIEHazardRecognizer : public AIEHazardRecognizer { +public: + DummyAIEHazardRecognizer(const ScheduleDAG *DAG) + : AIEHazardRecognizer(nullptr, nullptr, DAG, /*ScoreboardDepth=*/0) {} + + ~DummyAIEHazardRecognizer() override {} + + HazardType getHazardType(SUnit *SU, int DeltaCycles) override { + return NoHazard; + } +}; + +/// A simple wrapper around ScheduleDAGMITest that initializes a scheduler +/// and a scheduling zone (SchedBoundary) with a DummyAIEHazardRecognizer. +class AIEScheduleDAGMITest : public ScheduleDAGMITest { +protected: + void initializeScheduler(bool IsPreRA = false) override { + ScheduleDAGMITest::initializeScheduler(IsPreRA); + Scheduler->getSchedZone().HazardRec = + new DummyAIEHazardRecognizer(Scheduler.get()); + } +}; + +/// Case where all instructions are scheduled in Bot.getCurrCycle() +TEST_F(AIEScheduleDAGMITest, SchedNoDelta) { + auto *MI0 = appendPlainInstr(); + auto *MI1 = appendPlainInstr(); + auto *MI2 = appendPlainInstr(); + + initializeScheduler(); + SchedBoundary &Bot = Scheduler->getSchedZone(); + + // Mark all instructions as available. + for (auto *MI : {MI0, MI1, MI2}) + Bot.releaseNode(Scheduler->getSUnit(MI), /*ReadyCycle=*/0, false); + + Scheduler->scheduleInstr(MI2, Bot); + Bot.bumpCycle(2); + Scheduler->scheduleInstr(MI0, Bot); + Bot.bumpCycle(3); + Scheduler->scheduleInstr(MI1, Bot); + + EXPECT_EQ(MISeq(*MBB), MISeq({MI1, MI0, MI2})); +} + +} // end namespace diff --git a/llvm/unittests/Target/AIE/CMakeLists.txt b/llvm/unittests/Target/AIE/CMakeLists.txt index fa0ace57aaa2..13e56014dc1d 100644 --- a/llvm/unittests/Target/AIE/CMakeLists.txt +++ b/llvm/unittests/Target/AIE/CMakeLists.txt @@ -6,6 +6,7 @@ # (c) Copyright 2023-2024 Advanced Micro Devices, Inc. 
or its affiliates include_directories( ${LLVM_MAIN_SRC_DIR}/lib/Target/AIE + ${LLVM_MAIN_SRC_DIR}/unittests/CodeGen ${LLVM_BINARY_DIR}/lib/Target/AIE ) @@ -24,6 +25,8 @@ set(LLVM_LINK_COMPONENTS ) add_llvm_target_unittest(AIETests + ${LLVM_MAIN_SRC_DIR}/unittests/CodeGen/ScheduleDAGMITestUtils.cpp + AIEScheduleDAGMITest.cpp BundleTest.cpp HazardRecognizerTest.cpp ) From a7adce865ea30b2c7a01e6861e541cdedfaec54b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Wed, 3 Jul 2024 12:47:03 +0100 Subject: [PATCH 07/31] unittests/CodeGen: More flexible creation of test targets. --- llvm/unittests/CodeGen/MFCommon.inc | 46 +++++++++++++++++++---------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc index 1997e8052297..b411d12f6848 100644 --- a/llvm/unittests/CodeGen/MFCommon.inc +++ b/llvm/unittests/CodeGen/MFCommon.inc @@ -1,3 +1,14 @@ +//===- MFCommon.inc - Helpers for unit-testing the backend ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Modifications (c) Copyright 2024 Advanced Micro Devices, Inc. or its +// affiliates +// +//===----------------------------------------------------------------------===// + // Add a few Bogus backend classes so we can create MachineInstrs without // depending on a real target. 
class BogusTargetLowering : public TargetLowering { @@ -73,13 +84,15 @@ public: } }; -class BogusSubtarget : public TargetSubtargetInfo { +template class TestSubTarget : public TargetSubtargetInfo { public: - BogusSubtarget(TargetMachine &TM) - : TargetSubtargetInfo(Triple(""), "", "", "", {}, {}, nullptr, nullptr, - nullptr, nullptr, nullptr, nullptr), + TestSubTarget(TargetMachine &TM, StringRef CPU = "", StringRef TuneCPU = "", + ArrayRef ProcDescs = {}) + : TargetSubtargetInfo(Triple(""), CPU, TuneCPU, "", {}, ProcDescs, + nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr), FL(), TL(TM) {} - ~BogusSubtarget() override {} + ~TestSubTarget() override {} const TargetFrameLowering *getFrameLowering() const override { return &FL; } @@ -93,8 +106,9 @@ private: BogusFrameLowering FL; BogusRegisterInfo TRI; BogusTargetLowering TL; - TargetInstrInfo TII; + TIIClass TII; }; +using BogusSubTarget = TestSubTarget; static TargetOptions getTargetOptionsForBogusMachine() { TargetOptions Opts; @@ -102,39 +116,41 @@ static TargetOptions getTargetOptionsForBogusMachine() { return Opts; } -class BogusTargetMachine : public LLVMTargetMachine { +template class TestTargetMachine : public LLVMTargetMachine { public: - BogusTargetMachine() + TestTargetMachine() : LLVMTargetMachine(Target(), "", Triple(""), "", "", getTargetOptionsForBogusMachine(), Reloc::Static, CodeModel::Small, CodeGenOptLevel::Default), ST(*this) {} - ~BogusTargetMachine() override {} + ~TestTargetMachine() override {} const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override { return &ST; } private: - BogusSubtarget ST; + STClass ST; }; +using BogusTargetMachine = TestTargetMachine; -BogusTargetMachine *createTargetMachine() { +[[maybe_unused]] BogusTargetMachine *createTargetMachine() { static BogusTargetMachine BogusTM; return &BogusTM; } -std::unique_ptr createMachineFunction(LLVMContext &Ctx, - Module &M) { +[[maybe_unused]] std::unique_ptr +createMachineFunction(LLVMContext &Ctx, 
Module &M, + LLVMTargetMachine *TM = nullptr) { auto Type = FunctionType::get(Type::getVoidTy(Ctx), false); auto F = Function::Create(Type, GlobalValue::ExternalLinkage, "Test", &M); - auto TM = createTargetMachine(); + if (!TM) + TM = createTargetMachine(); unsigned FunctionNum = 42; MachineModuleInfo MMI(TM); const TargetSubtargetInfo &STI = *TM->getSubtargetImpl(*F); return std::make_unique(*F, *TM, STI, FunctionNum, MMI); } - From dd08ca24217654115fd0d68e30a404acf5c00516 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Wed, 3 Jul 2024 13:31:47 +0100 Subject: [PATCH 08/31] unittests/CodeGen: support DeltaCycles in ScheduleDAGMITestUtils This only nicely works for out-of-order targets, so an extra parameter can be passed to ScheduleDAGMITest to use a custom TargetMachine instead of BogusTargetMachine. --- .../CodeGen/ScheduleDAGMITestUtils.cpp | 18 ++++++++++++------ .../unittests/CodeGen/ScheduleDAGMITestUtils.h | 4 ++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/llvm/unittests/CodeGen/ScheduleDAGMITestUtils.cpp b/llvm/unittests/CodeGen/ScheduleDAGMITestUtils.cpp index c9a3a544b8ce..2a5587eee735 100644 --- a/llvm/unittests/CodeGen/ScheduleDAGMITestUtils.cpp +++ b/llvm/unittests/CodeGen/ScheduleDAGMITestUtils.cpp @@ -66,16 +66,22 @@ void DummyScheduleDAGMI::scheduleInstr(MachineInstr *MI, bool IsTop, movePickedSU(*SU, IsTop, EmissionCycle); } -void DummyScheduleDAGMI::scheduleInstr(MachineInstr *MI, SchedBoundary &Zone) { - scheduleInstr(MI, Zone.isTop()); +void DummyScheduleDAGMI::scheduleInstr(MachineInstr *MI, SchedBoundary &Zone, + int Delta) { + // Move the instruction at the right place in MBB + unsigned EmitCycle = int(Zone.getCurrCycle()) - Delta; + scheduleInstr(MI, Zone.isTop(), EmitCycle); + + // Mimic MachineSchedStrategy::schedNode() unsigned &ReadyCycle = Zone.isTop() ? 
getSUnit(MI)->TopReadyCycle : getSUnit(MI)->BotReadyCycle; - ReadyCycle = std::max(ReadyCycle, Zone.getCurrCycle()); - Zone.bumpNode(getSUnit(MI)); + ReadyCycle = std::max(ReadyCycle, EmitCycle); + Zone.bumpNode(getSUnit(MI), Delta); } -ScheduleDAGMITest::ScheduleDAGMITest() : Mod("Module", Ctx) { - MF = createMachineFunction(Ctx, Mod); +ScheduleDAGMITest::ScheduleDAGMITest(LLVMTargetMachine *TM) + : Mod("Module", Ctx) { + MF = createMachineFunction(Ctx, Mod, TM); MBB = MF->CreateMachineBasicBlock(); } diff --git a/llvm/unittests/CodeGen/ScheduleDAGMITestUtils.h b/llvm/unittests/CodeGen/ScheduleDAGMITestUtils.h index 8700c6b59df1..6e6eb13096e7 100644 --- a/llvm/unittests/CodeGen/ScheduleDAGMITestUtils.h +++ b/llvm/unittests/CodeGen/ScheduleDAGMITestUtils.h @@ -32,7 +32,7 @@ class DummyScheduleDAGMI : public ScheduleDAGMI { /// Move \p MI to Zone and update its ReadyCycle /// This essentially mimics the body of the scheduling loop inside /// ScheduleDAGMI::schedule(). - void scheduleInstr(MachineInstr *MI, SchedBoundary &Zone); + void scheduleInstr(MachineInstr *MI, SchedBoundary &Zone, int Delta = 0); SchedBoundary &getSchedZone() { return SchedZone; } @@ -45,7 +45,7 @@ class DummyScheduleDAGMI : public ScheduleDAGMI { class ScheduleDAGMITest : public testing::Test { protected: - ScheduleDAGMITest(); + ScheduleDAGMITest(LLVMTargetMachine *TM = nullptr); /// Initialize a DummyScheduleDAGMI so it is ready to schedule instructions /// in \p MBB From d93d8da28e56c91025dc7bba588303c84fab135b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Wed, 3 Jul 2024 13:41:29 +0100 Subject: [PATCH 09/31] [AIEX] Define AIETest target for scheduling unit-tests And use it to verify scheduling with DeltaCycles. We had unit tests for the MI insertion logic in ScheduleDAGMITestUtils.cpp, but it really just verifies movePickedSU(). Here we test a larger chunk of the scheduler. 
--- .../Target/AIE/AIEScheduleDAGMITest.cpp | 27 +++++++++ llvm/unittests/Target/AIE/AIETestTarget.cpp | 59 +++++++++++++++++++ llvm/unittests/Target/AIE/AIETestTarget.h | 18 ++++++ llvm/unittests/Target/AIE/CMakeLists.txt | 1 + 4 files changed, 105 insertions(+) create mode 100644 llvm/unittests/Target/AIE/AIETestTarget.cpp create mode 100644 llvm/unittests/Target/AIE/AIETestTarget.h diff --git a/llvm/unittests/Target/AIE/AIEScheduleDAGMITest.cpp b/llvm/unittests/Target/AIE/AIEScheduleDAGMITest.cpp index 0e18373b5252..cacd0cce4dea 100644 --- a/llvm/unittests/Target/AIE/AIEScheduleDAGMITest.cpp +++ b/llvm/unittests/Target/AIE/AIEScheduleDAGMITest.cpp @@ -10,6 +10,7 @@ #include "AIEHazardRecognizer.h" #include "AIEMachineScheduler.h" +#include "AIETestTarget.h" #include "ScheduleDAGMITestUtils.h" using namespace llvm; @@ -31,6 +32,10 @@ class DummyAIEHazardRecognizer : public AIEHazardRecognizer { /// A simple wrapper around ScheduleDAGMITest that initializes a scheduler /// and a scheduling zone (SchedBoundary) with a DummyAIEHazardRecognizer. class AIEScheduleDAGMITest : public ScheduleDAGMITest { +public: + AIEScheduleDAGMITest() + : ScheduleDAGMITest(AIE::createAIETestTargetMachine()) {} + protected: void initializeScheduler(bool IsPreRA = false) override { ScheduleDAGMITest::initializeScheduler(IsPreRA); @@ -61,4 +66,26 @@ TEST_F(AIEScheduleDAGMITest, SchedNoDelta) { EXPECT_EQ(MISeq(*MBB), MISeq({MI1, MI0, MI2})); } +/// Case where instructions are scheduled with a delta from Bot.getCurrCycle(). +TEST_F(AIEScheduleDAGMITest, SchedWithDelta) { + auto *MI0 = appendPlainInstr(); + auto *MI1 = appendPlainInstr(); + auto *MI2 = appendPlainInstr(); + + initializeScheduler(); + SchedBoundary &Bot = Scheduler->getSchedZone(); + + // Mark all instructions as available. 
+ for (auto *MI : {MI0, MI1, MI2}) + Bot.releaseNode(Scheduler->getSUnit(MI), /*ReadyCycle=*/0, false); + + Scheduler->scheduleInstr(MI2, Bot, -8); // Emit in cycle 0+8 + Bot.bumpCycle(2); + Scheduler->scheduleInstr(MI1, Bot, -5); // Emit in cycle 2+5 + Bot.bumpCycle(3); + Scheduler->scheduleInstr(MI0, Bot); // Emit in cycle 3 + + EXPECT_EQ(MISeq(*MBB), MISeq({MI2, MI1, MI0})); +} + } // end namespace diff --git a/llvm/unittests/Target/AIE/AIETestTarget.cpp b/llvm/unittests/Target/AIE/AIETestTarget.cpp new file mode 100644 index 000000000000..4e37b1eb786d --- /dev/null +++ b/llvm/unittests/Target/AIE/AIETestTarget.cpp @@ -0,0 +1,59 @@ +//===- AIETestTarget.cpp --------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// + +#include "AIETestTarget.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/MC/TargetRegistry.h" + +using namespace llvm; + +namespace { + +// Include helper functions to define a testing target. 
+#include "MFCommon.inc" + +MCSchedModel AIETestSchedModel = { + 1000, // IssueWidth + 1000, // MicroOpBufferSize + MCSchedModel::DefaultLoopMicroOpBufferSize, + MCSchedModel::DefaultLoadLatency, + MCSchedModel::DefaultHighLatency, + MCSchedModel::DefaultMispredictPenalty, + true, // PostRAScheduler + true, // CompleteModel + false, // EnableIntervals + 0, // Processor ID + nullptr, // No resources + nullptr, // No sched classes + 0, // No resources + 0, // No sched classes + nullptr, // No Itinerary + nullptr // No extra processor descriptor +}; + +SubtargetSubTypeKV ProcModels[] = {SubtargetSubTypeKV{ + "aie-test", FeatureBitArray({}), FeatureBitArray({}), &AIETestSchedModel}}; + +} // namespace + +class AIETestSubTarget : public TestSubTarget { +public: + AIETestSubTarget(TargetMachine &TM) + : TestSubTarget(TM, "aie-test", "aie-test", ProcModels) { + } +}; + +LLVMTargetMachine *llvm::AIE::createAIETestTargetMachine() { + static TestTargetMachine AIETM; + return &AIETM; +} diff --git a/llvm/unittests/Target/AIE/AIETestTarget.h b/llvm/unittests/Target/AIE/AIETestTarget.h new file mode 100644 index 000000000000..2298509ef892 --- /dev/null +++ b/llvm/unittests/Target/AIE/AIETestTarget.h @@ -0,0 +1,18 @@ +//===- AIETestTarget.h ----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// + +#include "llvm/Target/TargetMachine.h" + +namespace llvm::AIE { + +/// Create an AIE-like target that can be used for unit-tests. 
+LLVMTargetMachine *createAIETestTargetMachine(); + +} // namespace llvm::AIE diff --git a/llvm/unittests/Target/AIE/CMakeLists.txt b/llvm/unittests/Target/AIE/CMakeLists.txt index 13e56014dc1d..2e2fe70c467d 100644 --- a/llvm/unittests/Target/AIE/CMakeLists.txt +++ b/llvm/unittests/Target/AIE/CMakeLists.txt @@ -27,6 +27,7 @@ set(LLVM_LINK_COMPONENTS add_llvm_target_unittest(AIETests ${LLVM_MAIN_SRC_DIR}/unittests/CodeGen/ScheduleDAGMITestUtils.cpp AIEScheduleDAGMITest.cpp + AIETestTarget.cpp BundleTest.cpp HazardRecognizerTest.cpp ) From 9d9433dae9c19e112f838a447b792b2b7a69edaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Thu, 4 Jul 2024 09:49:44 +0100 Subject: [PATCH 10/31] [AIEX] NFC: Make AIEBaseInstrInfo default constructible This can then serve as a version-agnostic base in unit tests. --- llvm/lib/Target/AIE/AIEBaseInstrInfo.h | 49 +++++++++++++++++++------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h index 7632602f596f..df7bac927756 100644 --- a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h +++ b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h @@ -29,42 +29,61 @@ namespace llvm { struct AIEBaseInstrInfo : public TargetInstrInfo { using TargetInstrInfo::TargetInstrInfo; - /// Return the opcode for a return instruction - virtual unsigned getReturnOpcode() const = 0; + /// Return the opcode for a return instruction + virtual unsigned getReturnOpcode() const { + llvm_unreachable("Target didn't implement getReturnOpcode"); + } /// Return the opcode for a call instruction /// \param CallerF The function that makes the call /// \param IsIndirect Select function pointer call or direct call /// \param Select a tail call variant. 
virtual unsigned getCallOpcode(const MachineFunction &CallerF, - bool IsIndirect, bool IsTailCall) const = 0; + bool IsIndirect, bool IsTailCall) const { + llvm_unreachable("Target didn't implement getCallOpcode"); + } + /// Return the kind of slot that this instruction can be executed in. /// This is used to direct the packetization of simple instructions. /// NOTE: If this is called on a Composite Instruction (i.e. an instruction /// defining a Packet format, owning possibly multiples slots), the returned /// slot will be the default one (unknown). - virtual MCSlotKind getSlotKind(unsigned Opcode) const = 0; - virtual const MCSlotInfo *getSlotInfo(const MCSlotKind Kind) const = 0; + virtual MCSlotKind getSlotKind(unsigned Opcode) const { + llvm_unreachable("Target didn't implement getSlotKind"); + } + virtual const MCSlotInfo *getSlotInfo(const MCSlotKind Kind) const { + llvm_unreachable("Target didn't implement getSlotInfo"); + } /// Return the Packet formats for this target - virtual const PacketFormats &getPacketFormats() const = 0; + virtual const PacketFormats &getPacketFormats() const { + llvm_unreachable("Target didn't implement getPacketFormats"); + } /// Return a nop of the given byte size, or the smallest if zero. 
- virtual unsigned getNopOpcode(size_t Size = 0) const = 0; + virtual unsigned getNopOpcode(size_t Size = 0) const { + llvm_unreachable("Target didn't implement getNopOpcode"); + } /// Return an opcode that reverses the branch condition of a given /// instruction /// \param Opc Opcode of the branch to reverse /// \pre Opc must be a conditional branch - virtual unsigned getOppositeBranchOpcode(unsigned Opc) const = 0; + virtual unsigned getOppositeBranchOpcode(unsigned Opc) const { + llvm_unreachable("Target didn't implement getOppositeBranchOpcode"); + } /// Return the opcode of an unconditional jump - virtual unsigned getJumpOpcode() const = 0; + virtual unsigned getJumpOpcode() const { + llvm_unreachable("Target didn't implement getJumpOpcode"); + } /// Return Multi-Slot Pseudo opcode based on Reg type and imm. size virtual unsigned getConstantMovOpcode(MachineRegisterInfo &MRI, unsigned int Reg, APInt &Val) const { - return -1; + llvm_unreachable("Target didn't implement getConstantMovOpcode"); } /// Returns the opcode for CYCLE_SEPARATOR meta instruction. 
/// Used for debugging purposes - virtual unsigned getCycleSeparatorOpcode() const { return -1; } + virtual unsigned getCycleSeparatorOpcode() const { + llvm_unreachable("Target didn't implement getCycleSeparatorOpcode"); + } /// Check whether Opc represents a lock instruction virtual bool isLock(unsigned Opc) const { return false; } /// Check whether this is a delayed scheduling barrier induced from @@ -124,10 +143,14 @@ struct AIEBaseInstrInfo : public TargetInstrInfo { } // Used for Load/Store combiners - virtual unsigned getOffsetMemOpcode(unsigned BaseMemOpcode) const = 0; + virtual unsigned getOffsetMemOpcode(unsigned BaseMemOpcode) const { + llvm_unreachable("Target didn't implement getOffsetMemOpcode"); + } virtual std::optional getCombinedPostIncOpcode(MachineInstr &BaseMemI, MachineInstr &PtrAddI, - TypeSize Size) const = 0; + TypeSize Size) const { + llvm_unreachable("Target didn't implement getCombinedPostIncOpcode"); + } // Opcodes related to hardware loop handling virtual bool isHardwareLoopDec(unsigned Opcode) const { return false; } From 6d9f4f9be635538ca9f53e22a2bd192a13f08f52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Thu, 4 Jul 2024 09:58:13 +0100 Subject: [PATCH 11/31] [AIEX] NFC: Add initializer_list ctor and operator== to AIE::Bundle This makes AIE::Bundle easier to use in unit tests. 
--- llvm/lib/Target/AIE/AIEBundle.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/llvm/lib/Target/AIE/AIEBundle.h b/llvm/lib/Target/AIE/AIEBundle.h index f3a401d49f83..c5b2c8f941d9 100644 --- a/llvm/lib/Target/AIE/AIEBundle.h +++ b/llvm/lib/Target/AIE/AIEBundle.h @@ -45,6 +45,15 @@ template class Bundle { Bundle(const AIEBaseMCFormats *FormatInterface) : FormatInterface(FormatInterface) {} + Bundle(const std::initializer_list &Instrs, + const AIEBaseMCFormats *FormatInterface) + : FormatInterface(FormatInterface) { + bool ComputeSlots = (FormatInterface != nullptr); + for (I *Instr : Instrs) { + add(Instr, std::nullopt, ComputeSlots); + } + } + /// Returns whether adding Instr to the current bundle leaves it valid. /// \param Instr instruction to add. bool canAdd(I *Instr) const { return canAdd(Instr->getOpcode()); } @@ -221,6 +230,11 @@ template class Bundle { std::vector MetaInstrs; }; +template bool operator==(const Bundle &B1, const Bundle &B2) { + return std::tie(B1.Instrs, B1.SlotMap, B1.MetaInstrs) == + std::tie(B2.Instrs, B2.SlotMap, B2.MetaInstrs); +} + using MCBundle = Bundle; using MachineBundle = Bundle; From b9e0c66b2fba5747b35dad0d7acd134a98f959b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Mon, 4 Mar 2024 13:10:42 +0000 Subject: [PATCH 12/31] [AIEX] Make computeAndFinalizeBundles resilient to pre-RA rescheduling See GenericScheduler::reschedulePhysReg(), this can actually move instructions without updating their ReadyCycle --- llvm/lib/Target/AIE/AIEMachineScheduler.cpp | 14 ++++- llvm/lib/Target/AIE/AIEMachineScheduler.h | 4 ++ .../CodeGen/ScheduleDAGMITestUtils.h | 2 +- .../Target/AIE/AIEScheduleDAGMITest.cpp | 52 +++++++++++++++++++ llvm/unittests/Target/AIE/AIETestTarget.cpp | 7 +-- 5 files changed, 74 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp index 40f95f4ab998..649d115b5779 100644 --- 
a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp +++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp @@ -132,7 +132,10 @@ void bumpCycleForBundles(unsigned ToCycle, } } -std::vector computeAndFinalizeBundles(SchedBoundary &Zone) { +} // namespace + +std::vector +llvm::AIE::computeAndFinalizeBundles(SchedBoundary &Zone) { LLVM_DEBUG(dbgs() << "Computing Bundles for Zone " << (Zone.isTop() ? "Top\n" : "Bot\n")); const ScheduleDAGMI &DAG = *Zone.DAG; @@ -148,6 +151,14 @@ std::vector computeAndFinalizeBundles(SchedBoundary &Zone) { if (!SU) continue; unsigned EmitCycle = Zone.isTop() ? SU->TopReadyCycle : SU->BotReadyCycle; + + if (!ComputeSlots && EmitCycle < Bundles.size()) { + // The pre-RA scheduler can actually re-order copies and immediate + // moves, disregarding the emission cycle. + // See GenericScheduler::reschedulePhysReg(). + EmitCycle = Bundles.size(); + } + if (EmitCycle != Bundles.size()) bumpCycleForBundles(EmitCycle, Bundles, CurrBundle); @@ -197,6 +208,7 @@ std::vector computeAndFinalizeBundles(SchedBoundary &Zone) { return Bundles; } +namespace { /// Search for instructions that might jump to an unknown target block bool hasUnknownSuccessors( llvm::iterator_range Region, diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.h b/llvm/lib/Target/AIE/AIEMachineScheduler.h index b2f68a07129b..420c9fa4ff8e 100644 --- a/llvm/lib/Target/AIE/AIEMachineScheduler.h +++ b/llvm/lib/Target/AIE/AIEMachineScheduler.h @@ -28,6 +28,10 @@ using BlockType = AIE::BlockType; using Region = AIE::Region; using ScoreboardTrust = AIE::ScoreboardTrust; +namespace AIE { +std::vector computeAndFinalizeBundles(SchedBoundary &Zone); +} // namespace AIE + /// A MachineSchedStrategy implementation for AIE post RA scheduling. 
class AIEPostRASchedStrategy : public PostGenericScheduler { /// Maintain the state of interblock/loop-aware scheduling diff --git a/llvm/unittests/CodeGen/ScheduleDAGMITestUtils.h b/llvm/unittests/CodeGen/ScheduleDAGMITestUtils.h index 6e6eb13096e7..173de188f82a 100644 --- a/llvm/unittests/CodeGen/ScheduleDAGMITestUtils.h +++ b/llvm/unittests/CodeGen/ScheduleDAGMITestUtils.h @@ -78,7 +78,7 @@ class ScheduleDAGMITest : public testing::Test { /// instructions without actual name or valid MCDesc. struct MISeq { std::vector Seq; - MISeq(std::initializer_list L) : Seq(L) {} + MISeq(ArrayRef L) : Seq(L) {} MISeq(const MachineBasicBlock &MBB) { for (const MachineInstr &MI : MBB) Seq.push_back(&MI); diff --git a/llvm/unittests/Target/AIE/AIEScheduleDAGMITest.cpp b/llvm/unittests/Target/AIE/AIEScheduleDAGMITest.cpp index cacd0cce4dea..77a3e98dbe74 100644 --- a/llvm/unittests/Target/AIE/AIEScheduleDAGMITest.cpp +++ b/llvm/unittests/Target/AIE/AIEScheduleDAGMITest.cpp @@ -15,6 +15,15 @@ using namespace llvm; +namespace llvm::AIE { + +void PrintTo(const MachineBundle &B, std::ostream *OS) { + // Re-use MISeq to print something useful from MIs with dummy descriptors. + *OS << "Bundle{ " << MISeq(B.Instrs) << " }"; +} + +} // namespace llvm::AIE + namespace { class DummyAIEHazardRecognizer : public AIEHazardRecognizer { @@ -88,4 +97,47 @@ TEST_F(AIEScheduleDAGMITest, SchedWithDelta) { EXPECT_EQ(MISeq(*MBB), MISeq({MI2, MI1, MI0})); } +/// Verify the behavior of computeAndFinalizeBundles. In pre-RA scheduling, +/// it should also support the case where the SU's ReadyCycle is out-of-date +/// due to it being moved by GenericScheduler::reschedulePhysReg(). +TEST_F(AIEScheduleDAGMITest, SchedThenMove) { + auto *MI0 = appendPlainInstr(); + auto *MI1 = appendPlainInstr(); + auto *MI2 = appendPlainInstr(); + + initializeScheduler(/*IsPreRA=*/true); + SchedBoundary &Bot = Scheduler->getSchedZone(); + + // Mark all instructions as available. 
+ for (auto *MI : {MI0, MI1, MI2}) + Bot.releaseNode(Scheduler->getSUnit(MI), /*ReadyCycle=*/0, false); + + Scheduler->scheduleInstr(MI2, Bot); + Bot.bumpCycle(2); + Scheduler->scheduleInstr(MI1, Bot); + Bot.bumpCycle(3); + Scheduler->scheduleInstr(MI0, Bot); + + EXPECT_EQ(MISeq(*MBB), MISeq({MI0, MI1, MI2})); + + // Make sure the bundles are computed as expected, c1 should be empty. + const AIEBaseMCFormats *Fmts = nullptr; + std::vector ExpectedBundles = { + AIE::MachineBundle({MI0}, Fmts), // c3 + AIE::MachineBundle({MI1}, Fmts), // c2 + AIE::MachineBundle({}, Fmts), // c1 + AIE::MachineBundle({MI2}, Fmts)}; // c0 + EXPECT_EQ(AIE::computeAndFinalizeBundles(Bot), ExpectedBundles); + + // Now move an instruction to simulate GenericScheduler::reschedulePhysReg. + // This causes the SU's ReadyCycle to be out-of-sync with its position in MBB. + Scheduler->moveInstruction(MI2, MI1); + ExpectedBundles = {AIE::MachineBundle({MI0}, Fmts), // c3 + AIE::MachineBundle({MI2, MI1}, Fmts), // c2 + AIE::MachineBundle({}, Fmts), // c1 + AIE::MachineBundle({}, Fmts)}; // c0 + EXPECT_EQ(Scheduler->getSUnit(MI2)->BotReadyCycle, 0U); + EXPECT_EQ(AIE::computeAndFinalizeBundles(Bot), ExpectedBundles); +} + } // end namespace diff --git a/llvm/unittests/Target/AIE/AIETestTarget.cpp b/llvm/unittests/Target/AIE/AIETestTarget.cpp index 4e37b1eb786d..fa281d77683f 100644 --- a/llvm/unittests/Target/AIE/AIETestTarget.cpp +++ b/llvm/unittests/Target/AIE/AIETestTarget.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "AIETestTarget.h" +#include "AIEBaseInstrInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -46,11 +47,11 @@ SubtargetSubTypeKV ProcModels[] = {SubtargetSubTypeKV{ } // namespace -class AIETestSubTarget : public TestSubTarget { +class AIETestSubTarget : public TestSubTarget { public: AIETestSubTarget(TargetMachine &TM) - : 
TestSubTarget(TM, "aie-test", "aie-test", ProcModels) { - } + : TestSubTarget(TM, "aie-test", "aie-test", + ProcModels) {} }; LLVMTargetMachine *llvm::AIE::createAIETestTargetMachine() { From cc1823389d47b4a64e858f68691d2896395a0375 Mon Sep 17 00:00:00 2001 From: Andreu Carminati Date: Fri, 9 Aug 2024 12:23:59 +0100 Subject: [PATCH 13/31] [AIE2] Fix loop/epilogue analysis Now the DAG is constructed from the semantic order. --- .../Target/AIE/AIEInterBlockScheduling.cpp | 5 +- .../aie2/schedule/loopaware/loop-epilogue.mir | 69 +++++++++++++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp index b19a7545752c..98d01cc9ec64 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp @@ -485,10 +485,13 @@ int InterBlockScheduling::getCyclesToRespectTiming( for (auto &Bundle : R.Bundles) { for (MachineInstr *MI : Bundle.getInstrs()) { DistancesFromLoopEntry[MI] = DistFromLoopEntry; - Edges.addNode(MI); } ++DistFromLoopEntry; } + // Here we need to iterate using semantic order. + for (MachineInstr *MI : R) { + Edges.addNode(MI); + } }; // Construction of the superblock containing Loop+Epilogue diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-epilogue.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-epilogue.mir index 6b4c13f3ba51..271733f5da29 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-epilogue.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-epilogue.mir @@ -613,3 +613,72 @@ body: | RET implicit $lr DelayedSchedBarrier implicit $r3 ... + +# Test 10: the goal here is to test if we can handle negative latencies +# correctly. In this case, we need to be sure that we are constructing the +# DAG with the semantic order. 
+ +--- +name: negativeLatLoop +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: negativeLatLoop + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $p0, $r0, $r1, $r2, $r3, $r4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: JNZ $r0, %bb.1 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $dj0 = MOV_mv_scl $r2 + ; CHECK-NEXT: $r1 = LDA_dms_lda_idx $p0, killed $dj0 + ; CHECK-NEXT: $r1 = OR $r2, killed $r3 + ; CHECK-NEXT: $r3 = AND $r1, $r4 + ; CHECK-NEXT: DelayedSchedBarrier + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: liveins: $r1, $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $r2 = nsw ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: liveins: $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier implicit killed $r3 + bb.0: + successors: %bb.1 + liveins: $p0 + bb.1: + successors: %bb.2, %bb.1 + liveins: $p0, $r0, $r1, $r2, $r3, $r4 + $dj0 = MOV_mv_scl $r2 + $r1 = OR $r2, $r3 + $r3 = AND $r1, $r4 + $r1 = LDA_dms_lda_idx $p0, $dj0 + JNZ $r0, %bb.1 + DelayedSchedBarrier + bb.2: + successors: %bb.3 + liveins: $r1, $r3 + $r2 = nsw ADD_add_r_ri $r1, -1, implicit-def $srcarry + bb.3: + successors: + liveins: $r3 + RET implicit $lr + DelayedSchedBarrier implicit $r3 +... From e33bbe18b8ed6b608cf77a9c5677d9819ae58563 Mon Sep 17 00:00:00 2001 From: Andreu Carminati Date: Fri, 9 Aug 2024 12:25:34 +0100 Subject: [PATCH 14/31] [AIE2] Fix ZOL SWP tripcount update In some cases, the immediate instruction can have more users. 
--- .../Target/AIE/AIEBasePipelinerLoopInfo.cpp | 3 +- .../AIE/aie2/schedule/swp/swp-zoloop.mir | 47 +++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AIE/AIEBasePipelinerLoopInfo.cpp b/llvm/lib/Target/AIE/AIEBasePipelinerLoopInfo.cpp index a89c0c101302..a95527310f0b 100644 --- a/llvm/lib/Target/AIE/AIEBasePipelinerLoopInfo.cpp +++ b/llvm/lib/Target/AIE/AIEBasePipelinerLoopInfo.cpp @@ -726,7 +726,8 @@ std::optional ZeroOverheadLoop::createTripCountGreaterCondition( void ZeroOverheadLoop::adjustTripCount(int TripCountAdjust) { LLVM_DEBUG(dbgs() << "TripCountAdjust = " << TripCountAdjust << "\n"); - if (DefTripCount->getOperand(1).isImm()) { + if (DefTripCount->getOperand(1).isImm() && + MRI.hasOneUse(DefTripCount->getOperand(0).getReg())) { // If we have a constant here, just update the value. const int64_t InitVal = DefTripCount->getOperand(1).getImm(); DefTripCount->getOperand(1).setImm(InitVal + TripCountAdjust); diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/swp/swp-zoloop.mir b/llvm/test/CodeGen/AIE/aie2/schedule/swp/swp-zoloop.mir index 593c9ebe9f6b..3d92c71adba3 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/swp/swp-zoloop.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/swp/swp-zoloop.mir @@ -168,3 +168,50 @@ body: | PseudoRET implicit $lr, implicit $r0 ... + +# Test 4: LoopStart uses a register defined by mov +# immediate, but we cannot update the immediate value +# because LoopStart is not the only user of the result + +... 
+--- +name: maxCanonMOV_RLC_imm10_2_users +alignment: 16 +tracksRegLiveness: true +debugInstrRef: false +liveins: [] +body: | + ; CHECK-LABEL: name: maxCanonMOV_RLC_imm10 + ; CHECK: [[INIT:%[0-9]+]]:er = MOV_RLC_imm10_pseudo 4 + ; CHECK: [[COPY:%[0-9]+]]:em = COPY [[INIT]] + ; CHECK: [[MOV:%[0-9]+]]:er = MOV_RLC_imm11_pseudo 0 + ; CHECK: [[MOV1:%[0-9]+]]:er = MOV_RLC_imm10_pseudo 16 + ; CHECK: [[LDEC:%[0-9]+]]:er = ADD_NC_GPR [[MOV1]], -3 + ; CHECK: LoopStart [[LDEC]] + bb.1: + liveins: $p0, $p1 + %1:ep = COPY $p0 + %2:ep = COPY $p1 + %26:er = MOV_RLC_imm10_pseudo 4 + %29:em = COPY %26 + %17:er = MOV_RLC_imm11_pseudo 0 + %19:er = MOV_RLC_imm10_pseudo 16 + LoopStart %19:er + + bb.3: + %4:ep = PHI %1, %bb.1, %8, %bb.3 + %5:er = PHI %17, %bb.1, %0, %bb.3 + %9:er = LDA_dms_lda_idx_imm %4, 0 + %8:ep = PADD_mod_pseudo %4, %29 + %14:er = ABS %9, implicit-def $srcarry + %24:er27 = LT %5, %14 + %0:er = SELNEZ %14, %5, %24 + PseudoLoopEnd , %bb.3 + PseudoJ_jump_imm %bb.2 + + bb.2: + $r0 = COPY %0 + $r1 = COPY %19 + PseudoRET implicit $lr, implicit $r0, implicit $r1 + +... From d74d6a0f993eb66667004e313f44f1c15c041609 Mon Sep 17 00:00:00 2001 From: Andreu Carminati Date: Mon, 12 Aug 2024 14:38:58 +0100 Subject: [PATCH 15/31] [AIE2] Fix II_VST_PACK* and II_VST_2/3D_PACK Those itineraries also read rs port. 
--- llvm/lib/Target/AIE/AIE2Schedule.td | 8 +++--- .../AIE/aie2/schedule/resource/w_rs.mir | 28 ++++++++++++++++++- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AIE/AIE2Schedule.td b/llvm/lib/Target/AIE/AIE2Schedule.td index 72cda64da720..69d8a5745464 100644 --- a/llvm/lib/Target/AIE/AIE2Schedule.td +++ b/llvm/lib/Target/AIE/AIE2Schedule.td @@ -838,19 +838,19 @@ MemInstrItinData, SimpleCycle [1,1,1,1,1,1,1,/*def:srSRS_of*/3,/*crSat*/1,/*crRnd*/1,/*crSRSSign*/1], MemoryCycles<7>>, MemInstrItinData, SimpleCycle], + [AvoidPartWordStore, SimpleCycle, SimpleCycle], [1,1,1,/*crSat*/1,/*crPackSign*/1], MemoryCycles<5>>, MemInstrItinData, SimpleCycle], + [AvoidPartWordStore, SimpleCycle, SimpleCycle], [1,1,1,1,/*crSat*/1,/*crPackSign*/1], MemoryCycles<5>>, MemInstrItinData, SimpleCycle], + [AvoidPartWordStore, SimpleCycle, SimpleCycle], [1,1,1,1,1,/*crSat*/1,/*crPackSign*/1], MemoryCycles<5>>, MemInstrItinData, SimpleCycle], + [AvoidPartWordStore, SimpleCycle, SimpleCycle], [1,1,1,1,1,1,/*crSat*/1,/*crPackSign*/1], MemoryCycles<5>>, // Note: VST_CONV's (bf16.fp32) store happens in E7 instead of E5. diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/resource/w_rs.mir b/llvm/test/CodeGen/AIE/aie2/schedule/resource/w_rs.mir index e544055310af..6358a70b271c 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/resource/w_rs.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/resource/w_rs.mir @@ -5,7 +5,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # # (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates -# RUN: llc -march=aie2 -run-pass=postmisched %topdown-multi %s -o - | FileCheck %s +# RUN: llc -mtriple=aie2 -run-pass=postmisched %topdown-multi %s -o - | FileCheck %s # VST_W and VST_2D_W access W_RS_PORT at E1 and VUNPACK does so in E7. # The scheduler needs to insert NOPs to accomodate for this fact. 
@@ -257,3 +257,29 @@ body: | $wl2 = VPACK_D4_D8 $x3, implicit $crsat, implicit $crpacksign $wl2 = VPACK_D4_D8 $x3, implicit $crsat, implicit $crpacksign ... + +# This test triggers a structural conflict on the rs read port (W_RS_PORT), +# forcing one more NOP to be inserted before VST_PACK, in addition to the load +# latency NOPs. +# VUNPACK accesses W_RS_PORT in E7 and VST_PACK in E1. + +--- +name: VST_PACK_W_RS_E1 +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VST_PACK_W_RS_E1 + ; CHECK: $wl3, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm killed $p0, 0 + ; CHECK-NEXT: $x0 = VUNPACK_S16_S8 $wl3 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $p1 = VST_PACK_S8_S16_ag_pstm_nrm_imm killed $p1, 0, killed $x3, implicit $crsat + ; CHECK-NEXT: NOP + $wl3, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p0, 0 + $x0 = VUNPACK_S16_S8 $wl3 + $p1 = VST_PACK_S8_S16_ag_pstm_nrm_imm $p1, 0, $x3, implicit $crsat +... From 72df9eb5b411e6284e8238b4c42a3c3f75567983 Mon Sep 17 00:00:00 2001 From: Andreu Carminati Date: Mon, 12 Aug 2024 16:04:15 +0100 Subject: [PATCH 16/31] [AIE] Cast operand in AIELoopUtils assert We need to cast to the correct type. 
--- llvm/lib/Target/AIE/Utils/AIELoopUtils.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AIE/Utils/AIELoopUtils.cpp b/llvm/lib/Target/AIE/Utils/AIELoopUtils.cpp index 9955c0c04d31..f52fc7d4f956 100644 --- a/llvm/lib/Target/AIE/Utils/AIELoopUtils.cpp +++ b/llvm/lib/Target/AIE/Utils/AIELoopUtils.cpp @@ -20,7 +20,8 @@ std::optional<int64_t> getMinTripCount(const MDNode *LoopID) { return std::nullopt; assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); - assert(LoopID->getOperand(0) == LoopID && "invalid loop"); + assert(dyn_cast<MDNode>(LoopID->getOperand(0)) == LoopID && + "invalid loop metadata"); int64_t MinTripCount = 0; for (unsigned I = 1, E = LoopID->getNumOperands(); I < E; ++I) { From 2cc3a87dc23eea1c9253f3a4e201653e0880f8b6 Mon Sep 17 00:00:00 2001 From: Krishnam Tibrewala Date: Mon, 24 Jun 2024 12:20:11 -0700 Subject: [PATCH 17/31] [AIE2] Add func to translate AS to MemoryBank --- llvm/lib/Target/AIE/AIE2AddrSpace.h | 46 +++++++++++++++++++++ llvm/lib/Target/AIE/AIE2Subtarget.cpp | 51 ++++++++++++++++++++++++ llvm/lib/Target/AIE/AIE2Subtarget.h | 3 ++ llvm/lib/Target/AIE/AIEBaseSubtarget.cpp | 6 +++ llvm/lib/Target/AIE/AIEBaseSubtarget.h | 3 ++ 5 files changed, 109 insertions(+) create mode 100644 llvm/lib/Target/AIE/AIE2AddrSpace.h diff --git a/llvm/lib/Target/AIE/AIE2AddrSpace.h b/llvm/lib/Target/AIE/AIE2AddrSpace.h new file mode 100644 index 000000000000..70e40c372757 --- /dev/null +++ b/llvm/lib/Target/AIE/AIE2AddrSpace.h @@ -0,0 +1,46 @@ +//===-- AIE2AddrSpace.h - Define Address Space for AIEngine V2 ---*- C++-*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2023-2024 Advanced Micro Devices, Inc. 
or its affiliates +// +//===----------------------------------------------------------------------===// +// +// This file declares the AIEngine V2 Address Space and DM banks +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_AIE2ADDRSPACE_H +#define LLVM_SUPPORT_AIE2ADDRSPACE_H + +namespace llvm { + +namespace AIE2 { + +enum class AddressSpaces { + none, // Default address space + PM, // Address space for Program Memory (PM) + DM, // Address space for Data Memory(DM) includes Bank A, B, C, D + DM_test, + stack, // Address space for stack + a, // Address space for DM Bank A + b, // Address space for DM Bank B + c, // Address space for DM Bank C + d, // Address space for DM Bank D + ab, // Address space for DM Bank A and B + ac, // Address space for DM Bank A and C + ad, // Address space for DM Bank A and D + bc, // Address space for DM Bank B and C + bd, // Address space for DM Bank B and D + cd, // Address space for DM Bank C and D + TM // Address space for TM (Tile Memory) +}; + +enum class AIEBanks { A, B, C, D }; + +} // end namespace AIE2 +} // end namespace llvm + +#endif // LLVM_SUPPORT_AIE2ADDRSPACE_H diff --git a/llvm/lib/Target/AIE/AIE2Subtarget.cpp b/llvm/lib/Target/AIE/AIE2Subtarget.cpp index 88f6bee258f0..b43ff6f135c1 100644 --- a/llvm/lib/Target/AIE/AIE2Subtarget.cpp +++ b/llvm/lib/Target/AIE/AIE2Subtarget.cpp @@ -76,3 +76,54 @@ const RegisterBankInfo *AIE2Subtarget::getRegBankInfo() const { InstructionSelector *AIE2Subtarget::getInstructionSelector() const { return InstSelector.get(); } + +unsigned +AIE2Subtarget::getMemoryBanksFromAddressSpace(unsigned AddrSpace) const { + using namespace AIE2; + std::bitset<32> MemoryBanks; + + switch (static_cast(AddrSpace)) { + case AddressSpaces::a: + MemoryBanks.set(static_cast(AIEBanks::A)); + break; + case AddressSpaces::b: + MemoryBanks.set(static_cast(AIEBanks::B)); + break; + case AddressSpaces::c: + MemoryBanks.set(static_cast(AIEBanks::C)); + 
break; + case AddressSpaces::d: + MemoryBanks.set(static_cast(AIEBanks::D)); + break; + case AddressSpaces::ab: + MemoryBanks.set(static_cast(AIEBanks::A)) + .set(static_cast(AIEBanks::B)); + break; + case AddressSpaces::ac: + MemoryBanks.set(static_cast(AIEBanks::A)) + .set(static_cast(AIEBanks::C)); + break; + case AddressSpaces::ad: + MemoryBanks.set(static_cast(AIEBanks::A)) + .set(static_cast(AIEBanks::D)); + break; + case AddressSpaces::bc: + MemoryBanks.set(static_cast(AIEBanks::B)) + .set(static_cast(AIEBanks::C)); + break; + case AddressSpaces::bd: + MemoryBanks.set(static_cast(AIEBanks::B)) + .set(static_cast(AIEBanks::D)); + break; + case AddressSpaces::cd: + MemoryBanks.set(static_cast(AIEBanks::C)) + .set(static_cast(AIEBanks::D)); + break; + default: + // For unimplemented cases assume all + MemoryBanks.set(); + break; + } + + return MemoryBanks.to_ulong(); +} diff --git a/llvm/lib/Target/AIE/AIE2Subtarget.h b/llvm/lib/Target/AIE/AIE2Subtarget.h index 1eaa521e0f0e..0b5d5b66fad6 100644 --- a/llvm/lib/Target/AIE/AIE2Subtarget.h +++ b/llvm/lib/Target/AIE/AIE2Subtarget.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_AIE2_AIE2SUBTARGET_H #define LLVM_LIB_TARGET_AIE2_AIE2SUBTARGET_H #include "AIE2.h" +#include "AIE2AddrSpace.h" #include "AIE2FrameLowering.h" #include "AIE2ISelLowering.h" #include "AIE2InstrInfo.h" @@ -92,6 +93,8 @@ class AIE2Subtarget : public AIE2GenSubtargetInfo, public AIEBaseSubtarget { return &TSInfo; } + unsigned getMemoryBanksFromAddressSpace(unsigned AddrSpace) const override; + // Perform target-specific adjustments to the latency of a schedule // dependency. 
// If a pair of operands is associated with the schedule dependency, DefOpIdx diff --git a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp index f5ed84291139..04cbe2933c7b 100644 --- a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp +++ b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp @@ -133,6 +133,12 @@ void AIEBaseSubtarget::adjustSchedDependency( } } +unsigned +AIEBaseSubtarget::getMemoryBanksFromAddressSpace(unsigned AddrSpace) const { + // By default assume there are no conflicts. + return 0; +} + namespace { // Set latency and declare height/depth dirty if it changes diff --git a/llvm/lib/Target/AIE/AIEBaseSubtarget.h b/llvm/lib/Target/AIE/AIEBaseSubtarget.h index 5c0955f23b18..9abd02ed21da 100644 --- a/llvm/lib/Target/AIE/AIEBaseSubtarget.h +++ b/llvm/lib/Target/AIE/AIEBaseSubtarget.h @@ -21,6 +21,7 @@ #include "llvm/CodeGenTypes/MachineValueType.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/TargetParser/Triple.h" +#include namespace llvm { @@ -66,6 +67,8 @@ class AIEBaseSubtarget { int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep) const; + virtual unsigned getMemoryBanksFromAddressSpace(unsigned AddrSpace) const; + /// Required DAG mutations during Post-RA scheduling. 
static std::vector> getPostRAMutationsImpl(const Triple &TT); From 6d327f93c0bc95fd48f766180c86b72bacc133f7 Mon Sep 17 00:00:00 2001 From: Krishnam Tibrewala Date: Tue, 9 Jul 2024 11:55:06 -0700 Subject: [PATCH 18/31] [AIEX] llvm/TableGen: Backend to generate mem ops scheduling info This uses MemInstrItinData records to generate InstrInfo::getMemoryCycles(unsigned SchedClass) --- llvm/include/llvm/Target/AIETarget.td | 13 +- llvm/lib/Target/AIE/AIE2InstrInfo.h | 2 + llvm/lib/Target/AIE/AIE2Schedule.td | 158 +++++++++--------- llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp | 5 + llvm/lib/Target/AIE/AIEBaseInstrInfo.h | 12 +- llvm/test/TableGen/aie-memory-cycles-none.td | 7 + llvm/test/TableGen/aie-memory-cycles.td | 16 +- .../utils/TableGen/AIEMemoryCyclesEmitter.cpp | 31 +++- 8 files changed, 150 insertions(+), 94 deletions(-) diff --git a/llvm/include/llvm/Target/AIETarget.td b/llvm/include/llvm/Target/AIETarget.td index 5e193ada343a..7a98b23e075b 100644 --- a/llvm/include/llvm/Target/AIETarget.td +++ b/llvm/include/llvm/Target/AIETarget.td @@ -20,15 +20,18 @@ class PreSchedInstExpansion { } // Scheduling information for instructions that touch memory. -class MemoryCycles { +class MemoryCycles MemCycles> { + // Cycles in which memory is accessed + list MemCyclesList = MemCycles; + // Cycle for the first load or store to memory. - int FirstCycle = First; + int FirstCycle = !head(MemCycles); // Cycle for the last load or store to memory. This is typically the same as // FirstCycle, except for instructions that are read-modify-write. In that // case, the instruction touches memory twice, and LastCycle would be // different from FirstCycle. 
- int LastCycle = Last; + int LastCycle = !if(!empty(!tail(MemCycles)), !head(MemCycles), MemCycles[!sub(!size(MemCycles),1)]); } // An extension of InstrItinData that is able to store MemoryCycles @@ -39,6 +42,6 @@ class MemInstrItinData stages, list bypasses = [], int uops = 1> : InstrItinData { int FirstMemCycle = MemCycles.FirstCycle; - int LastMemCycle = !if(!lt(MemCycles.LastCycle, 0), - FirstMemCycle, MemCycles.LastCycle); + int LastMemCycle = MemCycles.LastCycle; + list MemCyclesList = MemCycles.MemCyclesList; } diff --git a/llvm/lib/Target/AIE/AIE2InstrInfo.h b/llvm/lib/Target/AIE/AIE2InstrInfo.h index 7749131c0a91..287d45f9fbf7 100644 --- a/llvm/lib/Target/AIE/AIE2InstrInfo.h +++ b/llvm/lib/Target/AIE/AIE2InstrInfo.h @@ -95,6 +95,8 @@ class AIE2InstrInfo : public AIE2GenInstrInfo { int getMinLastMemoryCycle() const override; int getMaxLastMemoryCycle() const override; + SmallVector getMemoryCycles(unsigned SchedClass) const override; + SmallVector getTiedRegInfo(unsigned Opcode) const override; diff --git a/llvm/lib/Target/AIE/AIE2Schedule.td b/llvm/lib/Target/AIE/AIE2Schedule.td index 69d8a5745464..d7d56ab6bb95 100644 --- a/llvm/lib/Target/AIE/AIE2Schedule.td +++ b/llvm/lib/Target/AIE/AIE2Schedule.td @@ -408,27 +408,27 @@ MemInstrItinData, SimpleCycle, EmptyCycles<4>, PrefixCycle, SimpleCycle], [7,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle, EmptyCycles<4>, PrefixCycle, SimpleCycle], [7,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle, EmptyCycles<4>, PrefixCycle, SimpleCycle], [7,1,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle, EmptyCycles<4>, PrefixCycle, SimpleCycle], [7,1,1,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle, EmptyCycles<4>, SimpleCycle], [7,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, InstrItinData], [1,1,1,1]>, InstrItinData], [1,1,1,1]>, InstrItinData], [1,1,1,1]>, @@ -470,7 +470,7 @@ MemInstrItinData, 
SimpleCycle, SimpleCycle], [1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, InstrItinData, SimpleCycle, AvoidSemaphore<2>], [1,1]>, @@ -502,62 +502,62 @@ MemInstrItinData, SimpleCycle, SimpleCycle], [1,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle, SimpleCycle], [1,1,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle, SimpleCycle], [1,1,1,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, PrefixCycle, SimpleCycle, SimpleCycle], [1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle], [1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle], [1,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle], [1,1,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle], [1,1,1,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle, EmptyCycles<4>, PrefixCycle, SimpleCycle, SimpleCycle], [7,1,1], - MemoryCycles<5, 11>>, + MemoryCycles<[5, 11]>>, MemInstrItinData, SimpleCycle, EmptyCycles<4>, PrefixCycle, SimpleCycle, SimpleCycle], [1,7,1,1], - MemoryCycles<5, 11>>, + MemoryCycles<[5, 11]>>, MemInstrItinData, SimpleCycle, EmptyCycles<4>, PrefixCycle, SimpleCycle, SimpleCycle], [1,1,7,1,1], - MemoryCycles<5, 11>>, + MemoryCycles<[5, 11]>>, MemInstrItinData, SimpleCycle, EmptyCycles<4>, PrefixCycle, SimpleCycle, SimpleCycle], [1,1,1,7,1,1], - MemoryCycles<5, 11>>, + MemoryCycles<[5, 11]>>, InstrItinData], [1,1,1,/*def:srCarry*/1]>, InstrItinData, PrefixCycle, SimpleCycle], @@ -610,139 +610,139 @@ MemInstrItinData, InstrStage<1, [LOAD_UNIT_A]>, EmptyCycles<4>, InstrStage<1,[UPS_UNIT]>, EmptyCycles<1>, InstrStage<1,[CM_WM_PORT]>], [9,7,1,1,/*def:srUPS_of*/8,/*crSat*/8,/*crUPSSign*/7], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle, EmptyCycles<4>, SimpleCycle, EmptyCycles<1>, SimpleCycle], [9,1,7,1,1,/*def:srUPS_of*/8,/*crSat*/8,/*crUPSSign*/7], - 
MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, InstrStage<1, [LOAD_UNIT_A]>, EmptyCycles<4>, InstrStage<1,[UPS_UNIT]>, EmptyCycles<1>, InstrStage<1,[CM_WM_PORT]>], [9,1,1,7,1,1,/*def:srUPS_of*/8,/*crSat*/8,/*crUPSSign*/7], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, InstrStage<1, [LOAD_UNIT_A]>, EmptyCycles<4>, InstrStage<1,[UPS_UNIT]>, EmptyCycles<1>, InstrStage<1,[CM_WM_PORT]>], [9,1,1,1,7,1,1,/*def:srUPS_of*/8,/*crSat*/8,/*crUPSSign*/7], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, InstrStage<1, [LOAD_UNIT_A]>, EmptyCycles<4>, InstrStage<1, [W_WA_PORT]>], [7,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle, EmptyCycles<4>, SimpleCycle], [7,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData,InstrStage<1, [LOAD_UNIT_A]>, EmptyCycles<4>, InstrStage<1, [W_WA_PORT]>], [7,1,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, InstrStage<1, [LOAD_UNIT_A]>, EmptyCycles<4>, InstrStage<1, [W_WA_PORT]>], [7,1,1,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, InstrStage<1, [LOAD_UNIT_A]>, EmptyCycles<4>, InstrStage<1, [CM_WM_PORT]>], [7,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle, EmptyCycles<4>, SimpleCycle], [7,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, InstrStage<1, [LOAD_UNIT_A]>, EmptyCycles<4>, InstrStage<1, [CM_WM_PORT]>], [7,1,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, InstrStage<1, [LOAD_UNIT_A]>, EmptyCycles<4>, InstrStage<1, [CM_WM_PORT]>], [7,1,1,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, InstrStage<1, [LOAD_UNIT_A]>, EmptyCycles<4>, InstrStage<1, [CM_WM_PORT]>], [7,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, InstrStage<1, [LOAD_UNIT_A]>, EmptyCycles<4>, InstrStage<1, [CM_WM_PORT]>], [7,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, InstrStage<1, [LOAD_UNIT_A]>, EmptyCycles<4>, InstrStage<1, 
[CM_WM_PORT]>], [7,1,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, InstrStage<1, [LOAD_UNIT_A]>, EmptyCycles<4>, InstrStage<1, [CM_WM_PORT]>], [7,1,1,1,1,1], - MemoryCycles<5>>, -MemInstrItinData>, + MemoryCycles<[5]>>, +MemInstrItinData>, MemInstrItinData>, + MemoryCycles<[5]>>, MemInstrItinData>, + MemoryCycles<[5]>>, MemInstrItinData>, + MemoryCycles<[5]>>, MemInstrItinData>, + MemoryCycles<[5]>>, MemInstrItinData>, + MemoryCycles<[5]>>, MemInstrItinData>, + MemoryCycles<[5]>>, MemInstrItinData>, + MemoryCycles<[5]>>, MemInstrItinData>, -MemInstrItinData>, -MemInstrItinData>, -MemInstrItinData>, + MemoryCycles<[5]>>, +MemInstrItinData>, +MemInstrItinData>, +MemInstrItinData>, MemInstrItinData, SimpleCycle], [7,1], - MemoryCycles<5>>, -MemInstrItinData>, -MemInstrItinData>, -MemInstrItinData>, -MemInstrItinData>, -MemInstrItinData>, -MemInstrItinData>, -MemInstrItinData>, -MemInstrItinData>, + MemoryCycles<[5]>>, +MemInstrItinData>, +MemInstrItinData>, +MemInstrItinData>, +MemInstrItinData>, +MemInstrItinData>, +MemInstrItinData>, +MemInstrItinData>, +MemInstrItinData>, MemInstrItinData, InstrStage<1, [LOAD_UNIT_A]>], [7,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, InstrStage<1, [LOAD_UNIT_A]>], [7,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, InstrStage<1, [LOAD_UNIT_A]>], [7,1,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, InstrStage<1, [LOAD_UNIT_A]>], [7,1,1,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, InstrItinData, EmptyCycles<3>, InstrStage<1, [CM_WA_PORT]>], [5,3,1,1,1], [VEC_Bypass,VEC_Bypass,NoBypass,NoBypass]>, InstrItinData, PrefixCycle, SimpleCycle], @@ -821,57 +821,57 @@ MemInstrItinData, SimpleCycle, E AvoidPartWordStore, EmptyCycles<1>, SimpleCycle], [1,1,1,1,/*def:srSRS_of*/3,/*crSat*/1,/*crRnd*/1,/*crSRSSign*/1], - MemoryCycles<7>>, + MemoryCycles<[7]>>, MemInstrItinData, SimpleCycle, EmptyCycles<1>, AvoidPartWordStore, EmptyCycles<1>, SimpleCycle], 
[1,1,1,1,1,/*def:srSRS_of*/3,/*crSat*/1,/*crRnd*/1,/*crSRSSign*/1], - MemoryCycles<7>>, + MemoryCycles<[7]>>, MemInstrItinData, SimpleCycle, EmptyCycles<1>, AvoidPartWordStore, EmptyCycles<1>, SimpleCycle], [1,1,1,1,1,1,/*def:srSRS_of*/3,/*crSat*/1,/*crRnd*/1,/*crSRSSign*/1], - MemoryCycles<7>>, + MemoryCycles<[7]>>, MemInstrItinData, SimpleCycle, EmptyCycles<1>, AvoidPartWordStore, EmptyCycles<1>, SimpleCycle], [1,1,1,1,1,1,1,/*def:srSRS_of*/3,/*crSat*/1,/*crRnd*/1,/*crSRSSign*/1], - MemoryCycles<7>>, + MemoryCycles<[7]>>, MemInstrItinData, SimpleCycle], [1,1,1,/*crSat*/1,/*crPackSign*/1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle], [1,1,1,1,/*crSat*/1,/*crPackSign*/1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle], [1,1,1,1,1,/*crSat*/1,/*crPackSign*/1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle], [1,1,1,1,1,1,/*crSat*/1,/*crPackSign*/1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, // Note: VST_CONV's (bf16.fp32) store happens in E7 instead of E5. // To accomodate for this extra delay we have to move the AvoidPartWordStore // from E1 to E3. 
MemInstrItinData, SimpleCycle, AvoidPartWordStore, EmptyCycles<1>], [1,1,1,/*def:srF2FFlags*/1,/*crRnd*/1,/*crF2FMask*/1], - MemoryCycles<7>>, + MemoryCycles<[7]>>, MemInstrItinData, SimpleCycle, AvoidPartWordStore, EmptyCycles<1>], [1,1,1,1,/*def:srF2FFlags*/1,/*crRnd*/1,/*crF2FMask*/1], - MemoryCycles<7>>, + MemoryCycles<[7]>>, MemInstrItinData, SimpleCycle, AvoidPartWordStore, EmptyCycles<1>], [1,1,1,1,1,/*def:srF2FFlags*/1,/*crRnd*/1,/*crF2FMask*/1], - MemoryCycles<7>>, + MemoryCycles<[7]>>, MemInstrItinData, SimpleCycle, AvoidPartWordStore, EmptyCycles<1>], [1,1,1,1,1,1,/*def:srF2FFlags*/1,/*crRnd*/1,/*crF2FMask*/1], - MemoryCycles<7>>, + MemoryCycles<[7]>>, // VCONV.bf16.fp32 InstrItinData, SimpleCycle], [2,1,/*def:srF2FFlags*/1,/*def:crF2FMask*/1,/*crRnd*/1]>, @@ -897,28 +897,28 @@ InstrItinData, InstrStage<1, [W_WM_PORT]>], MemInstrItinData, SimpleCycle], [1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle], [1,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle], [1,1,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle], [1,1,1,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle], [1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle], [1,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle], [1,1,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, MemInstrItinData, SimpleCycle], [1,1,1,1,1,1], - MemoryCycles<5>>, + MemoryCycles<[5]>>, InstrItinData, InstrStage<1, [W_RS_PORT]>], [7,7,/*crUnpackSign*/7]>, InstrItinData, SimpleCycle], diff --git a/llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp b/llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp index ddd938417d4b..c1e250e6d2e3 100644 --- a/llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp +++ b/llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp @@ -707,6 +707,11 @@ int AIEBaseInstrInfo::getMaxLastMemoryCycle() const { return std::numeric_limits().min(); } +SmallVector 
+AIEBaseInstrInfo::getMemoryCycles(unsigned SchedClass) const { + return {}; +} + bool AIEBaseInstrInfo::isLegalTypeToPad(const LLT &Ty, StringRef *ErrInfo) const { if (Ty.isVector() && Ty.getSizeInBits() == 128) diff --git a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h index df7bac927756..4825a33482b3 100644 --- a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h +++ b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h @@ -236,11 +236,11 @@ struct AIEBaseInstrInfo : public TargetInstrInfo { const MachineBasicBlock *MBB, const MachineFunction &MF) const override; - std::optional - getOperandLatency(const InstrItineraryData *ItinData, - const MachineInstr &DefMI, unsigned DefIdx, - const MachineInstr &UseMI, - unsigned UseIdx) const override; + std::optional getOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr &DefMI, + unsigned DefIdx, + const MachineInstr &UseMI, + unsigned UseIdx) const override; // Check if the MII points to a BUNDLE which contains a call instruction bool isCallBundle(MachineBasicBlock::iterator MII) const; @@ -305,6 +305,8 @@ struct AIEBaseInstrInfo : public TargetInstrInfo { virtual int getMinLastMemoryCycle() const; /// Return the maximum of LastMemoryCycle over all sched classes virtual int getMaxLastMemoryCycle() const; + /// Return cycles for memory operations of an instruction. 
+ virtual SmallVector getMemoryCycles(unsigned SchedClass) const; const AIEBaseMCFormats *getFormatInterface() const { return FormatInterface; } diff --git a/llvm/test/TableGen/aie-memory-cycles-none.td b/llvm/test/TableGen/aie-memory-cycles-none.td index 751622953b61..28a744b5d228 100644 --- a/llvm/test/TableGen/aie-memory-cycles-none.td +++ b/llvm/test/TableGen/aie-memory-cycles-none.td @@ -55,3 +55,10 @@ def FOO : TestInstruction { // CHECK-NEXT: return 2147483647; // CHECK-LABEL: int TestAIEInstrInfo::getMaxLastMemoryCycle() const { // CHECK-NEXT: return -2147483648; + +// CHECK: SmallVector +// CHECK-NEXT: TestAIEInstrInfo::getMemoryCycles(unsigned SchedClass) const { +// CHECK-NEXT: switch (SchedClass) { +// CHECK-NEXT: default: return {}; +// CHECK-NEXT: } +// CHECK-NEXT: } diff --git a/llvm/test/TableGen/aie-memory-cycles.td b/llvm/test/TableGen/aie-memory-cycles.td index aaa4067b051a..ac262d3fa8f7 100644 --- a/llvm/test/TableGen/aie-memory-cycles.td +++ b/llvm/test/TableGen/aie-memory-cycles.td @@ -19,9 +19,9 @@ def II_LATE_ST : InstrItinClass; def II_PART_ST : InstrItinClass; def II_OTHER : InstrItinClass; def AIEItineraries : ProcessorItineraries<[], [], [ - MemInstrItinData>, - MemInstrItinData>, - MemInstrItinData>, + MemInstrItinData>, + MemInstrItinData>, + MemInstrItinData>, InstrItinData ]>; @@ -75,3 +75,13 @@ let Itinerary = II_OTHER in // CHECK-NEXT: return 5; // CHECK-LABEL: int TestAIEInstrInfo::getMaxLastMemoryCycle() const { // CHECK-NEXT: return 11; + +// CHECK: SmallVector +// CHECK-NEXT: TestAIEInstrInfo::getMemoryCycles(unsigned SchedClass) const { +// CHECK-NEXT: switch (SchedClass) { +// CHECK-NEXT: default: return {}; +// CHECK-NEXT: case 2: return {7}; // II_LATE_ST +// CHECK-NEXT: case 3: return {5, 11}; // II_PART_ST +// CHECK-NEXT: case 4: return {5}; // II_ST +// CHECK-NEXT: } +// CHECK-NEXT: } diff --git a/llvm/utils/TableGen/AIEMemoryCyclesEmitter.cpp b/llvm/utils/TableGen/AIEMemoryCyclesEmitter.cpp index 
013cacf57160..111f3d2f9a5f 100644 --- a/llvm/utils/TableGen/AIEMemoryCyclesEmitter.cpp +++ b/llvm/utils/TableGen/AIEMemoryCyclesEmitter.cpp @@ -33,6 +33,7 @@ class AIEMemoryCyclesEmitter { const CodeGenSchedClass *SchedClass; // The sched class. unsigned FirstCycle; // The cycle for the first memory operation. unsigned LastCycle; // The cycle for the last memory operation. + std::vector MemoryCyclesVec; // The cycles for memory operations. }; public: @@ -55,6 +56,7 @@ class AIEMemoryCyclesEmitter { /// Generate C++ code from \p ItinMemCycles. void emitMemoryCyclesInfo(raw_ostream &OS, MemCycleGetter &Access); + void emitAllMemoryCyclesInfo(raw_ostream &OS); /// Process the SchedClasses, looking for those with MemoryCycles. void run(raw_ostream &OS); @@ -110,8 +112,11 @@ void AIEMemoryCyclesEmitter::evaluateSchedClass( PrintFatalError(ItinData->getLoc(), "FirstMemCycle greater than LastMemCycle"); } - ItinMemCycles.emplace_back( - MemoryCycles{&SchedClass, unsigned(FirstCycle), unsigned(LastCycle)}); + std::vector MemoryCyclesVec = + ItinData->getValueAsListOfInts("MemCyclesList"); + + ItinMemCycles.emplace_back(MemoryCycles{ + &SchedClass, unsigned(FirstCycle), unsigned(LastCycle), MemoryCyclesVec}); } // Generate something like: @@ -150,6 +155,27 @@ void AIEMemoryCyclesEmitter::emitMemoryCyclesInfo(raw_ostream &OS, << EndOfFunction; } +void AIEMemoryCyclesEmitter::emitAllMemoryCyclesInfo(raw_ostream &OS) { + const std::string Prefix(std::string(Target.getName()) + "InstrInfo::"); + const std::string EndOfFunction("\n}\n\n"); + OS << "SmallVector\n"; + OS << Prefix << "getMemoryCycles(unsigned SchedClass) const {\n"; + OS << " switch (SchedClass) {\n" << " default: return {};\n"; + for (const MemoryCycles &MemCycles : ItinMemCycles) { + assert(MemCycles.MemoryCyclesVec.size() <= 2 && "Too many memory cycles"); + OS << " case " << MemCycles.SchedClass->Index << ": return "; + OS << "{"; + for (size_t i = 0; i < MemCycles.MemoryCyclesVec.size(); ++i) { + OS << 
MemCycles.MemoryCyclesVec[i]; + if (i != MemCycles.MemoryCyclesVec.size() - 1) + OS << ", "; + } + OS << "}"; + OS << "; // " << MemCycles.SchedClass->Name << "\n"; + } + OS << " }" << EndOfFunction; +} + void AIEMemoryCyclesEmitter::run(raw_ostream &OS) { Records.startTimer("Process definitions"); for (const CodeGenSchedClass &SchedClass : SchedModels.explicit_classes()) @@ -162,6 +188,7 @@ void AIEMemoryCyclesEmitter::run(raw_ostream &OS) { MemCycleGetter GetLast(/*LastCycles=*/true); emitMemoryCyclesInfo(OS, GetFirst); emitMemoryCyclesInfo(OS, GetLast); + emitAllMemoryCyclesInfo(OS); } static TableGen::Emitter::OptClass From c54c049e79fa41ff6c8e48e7d9d4d56e71ab6388 Mon Sep 17 00:00:00 2001 From: Krishnam Tibrewala Date: Wed, 26 Jun 2024 15:31:09 -0700 Subject: [PATCH 19/31] [AIEX] Use bank info in Hazard Recognizer to avoid bank conflict --- llvm/lib/Target/AIE/AIE2Subtarget.cpp | 15 +- llvm/lib/Target/AIE/AIE2Subtarget.h | 4 +- llvm/lib/Target/AIE/AIEBaseSubtarget.cpp | 20 +- llvm/lib/Target/AIE/AIEBaseSubtarget.h | 7 +- llvm/lib/Target/AIE/AIEHazardRecognizer.cpp | 106 +- llvm/lib/Target/AIE/AIEHazardRecognizer.h | 34 +- .../Target/AIE/AIEInterBlockScheduling.cpp | 6 +- llvm/lib/Target/AIE/AIEMachineScheduler.cpp | 3 +- .../AIE/aie2/end-to-end/Conv2D-red-swp.ll | 150 +-- .../test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll | 48 +- .../AIE/aie2/schedule/implicit_ops.mir | 8 +- .../aie2/schedule/loopaware/Add2D-like.mir | 13 +- .../aie2/schedule/memory_dependencies_st.mir | 40 +- .../schedule/memory_dependencies_st_q.mir | 40 +- .../schedule/memory_dependencies_vst_am.mir | 40 +- .../schedule/memory_dependencies_vst_conv.mir | 40 +- .../schedule/memory_dependencies_vst_pack.mir | 40 +- .../schedule/memory_dependencies_vst_srs.mir | 40 +- .../schedule/memory_dependencies_vst_w.mir | 40 +- .../AIE/aie2/schedule/pre_ra/add2d_inner.mir | 6 +- .../aie2/schedule/resource/memory_bank.mir | 1119 +++++++++++++++++ .../AIE/aie2/schedule/vlda_vldb-compr.mir | 32 +- 
.../AIE/aie2/schedule/vlda_vldb-sparse.mir | 32 +- .../CodeGen/AIE/aie2/schedule/vlda_vldb.mir | 54 +- .../Target/AIE/HazardRecognizerTest.cpp | 137 +- 25 files changed, 1734 insertions(+), 340 deletions(-) create mode 100644 llvm/test/CodeGen/AIE/aie2/schedule/resource/memory_bank.mir diff --git a/llvm/lib/Target/AIE/AIE2Subtarget.cpp b/llvm/lib/Target/AIE/AIE2Subtarget.cpp index b43ff6f135c1..72a880de19e0 100644 --- a/llvm/lib/Target/AIE/AIE2Subtarget.cpp +++ b/llvm/lib/Target/AIE/AIE2Subtarget.cpp @@ -77,7 +77,17 @@ InstructionSelector *AIE2Subtarget::getInstructionSelector() const { return InstSelector.get(); } -unsigned +MemoryBankBits AIE2Subtarget::getDefaultMemoryBank() const { + using namespace AIE2; + std::bitset<32> MemoryBanks; + MemoryBanks.set(static_cast(AIEBanks::A)) + .set(static_cast(AIEBanks::B)) + .set(static_cast(AIEBanks::C)) + .set(static_cast(AIEBanks::D)); + return MemoryBanks.to_ulong(); +} + +MemoryBankBits AIE2Subtarget::getMemoryBanksFromAddressSpace(unsigned AddrSpace) const { using namespace AIE2; std::bitset<32> MemoryBanks; @@ -120,8 +130,7 @@ AIE2Subtarget::getMemoryBanksFromAddressSpace(unsigned AddrSpace) const { .set(static_cast(AIEBanks::D)); break; default: - // For unimplemented cases assume all - MemoryBanks.set(); + return getDefaultMemoryBank(); break; } diff --git a/llvm/lib/Target/AIE/AIE2Subtarget.h b/llvm/lib/Target/AIE/AIE2Subtarget.h index 0b5d5b66fad6..9b094da7ef86 100644 --- a/llvm/lib/Target/AIE/AIE2Subtarget.h +++ b/llvm/lib/Target/AIE/AIE2Subtarget.h @@ -93,7 +93,9 @@ class AIE2Subtarget : public AIE2GenSubtargetInfo, public AIEBaseSubtarget { return &TSInfo; } - unsigned getMemoryBanksFromAddressSpace(unsigned AddrSpace) const override; + MemoryBankBits getDefaultMemoryBank() const override; + MemoryBankBits + getMemoryBanksFromAddressSpace(unsigned AddrSpace) const override; // Perform target-specific adjustments to the latency of a schedule // dependency. 
diff --git a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp index 04cbe2933c7b..e86123e6c888 100644 --- a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp +++ b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp @@ -13,9 +13,11 @@ //===----------------------------------------------------------------------===// #include "AIEBaseSubtarget.h" +#include "AIE2Subtarget.h" #include "AIEBaseRegisterInfo.h" #include "AIEMachineScheduler.h" #include "AIEMaxLatencyFinder.h" +#include "AIESubtarget.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/ScheduleDAG.h" @@ -133,12 +135,28 @@ void AIEBaseSubtarget::adjustSchedDependency( } } -unsigned +const AIEBaseSubtarget &AIEBaseSubtarget::get(const MachineFunction &MF) { + if (MF.getTarget().getTargetTriple().isAIE1()) + return static_cast( + MF.getSubtarget()); + else if (MF.getTarget().getTargetTriple().isAIE2()) + return static_cast( + MF.getSubtarget()); + else + llvm_unreachable("Unknown subtarget"); +} + +MemoryBankBits AIEBaseSubtarget::getMemoryBanksFromAddressSpace(unsigned AddrSpace) const { // By default assume there are no conflicts. return 0; } +MemoryBankBits AIEBaseSubtarget::getDefaultMemoryBank() const { + // By default assume there are no conflicts. 
+ return 0; +} + namespace { // Set latency and declare height/depth dirty if it changes diff --git a/llvm/lib/Target/AIE/AIEBaseSubtarget.h b/llvm/lib/Target/AIE/AIEBaseSubtarget.h index 9abd02ed21da..91279a3fcf23 100644 --- a/llvm/lib/Target/AIE/AIEBaseSubtarget.h +++ b/llvm/lib/Target/AIE/AIEBaseSubtarget.h @@ -34,6 +34,8 @@ class ScheduleDAGMutation; class SUnit; class SDep; +using MemoryBankBits = uint64_t; + class AIEBaseSubtarget { private: Triple TargetTriple; @@ -42,6 +44,7 @@ class AIEBaseSubtarget { public: AIEBaseSubtarget(const Triple &TT) : TargetTriple(TT) {} + static const AIEBaseSubtarget &get(const MachineFunction &MF); virtual const TargetRegisterInfo *getRegisterInfo() const = 0; virtual const TargetFrameLowering *getFrameLowering() const = 0; virtual const AIEBaseInstrInfo *getInstrInfo() const = 0; @@ -67,7 +70,9 @@ class AIEBaseSubtarget { int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep) const; - virtual unsigned getMemoryBanksFromAddressSpace(unsigned AddrSpace) const; + virtual MemoryBankBits + getMemoryBanksFromAddressSpace(unsigned AddrSpace) const; + virtual MemoryBankBits getDefaultMemoryBank() const; /// Required DAG mutations during Post-RA scheduling. 
static std::vector> diff --git a/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp b/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp index bbcc9a2e3a83..11dcca57c280 100644 --- a/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp +++ b/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp @@ -49,24 +49,32 @@ void FuncUnitWrapper::setFormatInterface(const AIEBaseMCFormats *Formats) { bool FuncUnitWrapper::operator==(const FuncUnitWrapper &Other) const { return Required == Other.Required && Reserved == Other.Reserved && - Slots == Other.Slots; + Slots == Other.Slots && MemoryBanks == Other.MemoryBanks; } void FuncUnitWrapper::dump() const { const char *const Digits = "0123456789"; const char *const Spacer = "-|"; const int Upper = std::numeric_limits::digits - 1; - dbgs() << "Req : "; - for (int J = Upper; J >= 0; J--) - dbgs() << ((Required & (1ULL << J)) ? Digits[J % 10] : Spacer[J % 10 == 0]); - dbgs() << " Slots : "; - for (int J = 9; J >= 0; J--) - dbgs() << ((Slots & (1ULL << J)) ? Digits[J] : '-'); + // printFU dumps the bits of the FU mask it is given, highest bit first. + auto printFU = [&](const std::string &FUName, InstrStage::FuncUnits FU) { + dbgs() << FUName; + for (int J = Upper; J >= 0; J--) + dbgs() << ((FU & (1ULL << J)) ? Digits[J % 10] + : Spacer[J % 10 == 0]); + }; + auto printResource = [&](const std::string &ResourceName, uint64_t Resource) { + dbgs() << ResourceName; + for (int J = 9; J >= 0; J--) + dbgs() << ((Resource & (1ULL << J)) ? Digits[J] : '-'); + }; + + printFU("Req : ", Required); + printResource(" Slots : ", Slots); + printResource(" Memorybanks : ", MemoryBanks); if (!Reserved) return; - dbgs() << "\n\t Rsrv : "; - for (int J = Upper; J >= 0; J--) - dbgs() << ((Reserved & (1ULL << J)) ? 
Digits[J % 10] : Spacer[J % 10 == 0]); + printFU("\n\t Rsrv : ", Reserved); } void FuncUnitWrapper::clearResources() { @@ -74,26 +82,31 @@ void FuncUnitWrapper::clearResources() { Required = 0; Reserved = 0; Slots = 0; + MemoryBanks = 0; } bool FuncUnitWrapper::isEmpty() const { - return Required == 0 && Reserved == 0 && Slots == 0; + return Required == 0 && Reserved == 0 && Slots == 0 && MemoryBanks == 0; } void FuncUnitWrapper::blockResources() { Required = ~0; Reserved = ~0; Slots = ~0; + // Since the HW stalls in the event of memory bank conflicts, we don't need to + // block the resource. It is overly conservative if we block all memory banks. } FuncUnitWrapper &FuncUnitWrapper::operator|=(const FuncUnitWrapper &Other) { Required |= Other.Required; Reserved |= Other.Reserved; Slots |= Other.Slots; + MemoryBanks |= Other.MemoryBanks; return *this; } bool FuncUnitWrapper::conflict(const FuncUnitWrapper &Other) const { if ((Required & Other.Required) != 0 || (Slots & Other.Slots) != 0 || + (MemoryBanks & Other.MemoryBanks) != 0 || (Reserved & Other.Required) != 0 || (Required & Other.Reserved) != 0) { return true; } @@ -296,8 +309,8 @@ AIEHazardRecognizer::getHazardType(SUnit *SU, int DeltaCycles) { TII->getFormatInterface()->getAlternateInstsOpcode(MI->getOpcode()); if (AlternateInsts) { for (const auto AltInstOpcode : *AlternateInsts) { - ScheduleHazardRecognizer::HazardType Haz = - getHazardType(TII->get(AltInstOpcode), DeltaCycles); + ScheduleHazardRecognizer::HazardType Haz = getHazardType( + TII->get(AltInstOpcode), getMemoryBanks(MI), DeltaCycles); // Check if there is NoHazard, If there is a Hazard or NoopHazard check // for the next possible Opcode. 
if (Haz == NoHazard) { @@ -311,7 +324,7 @@ AIEHazardRecognizer::getHazardType(SUnit *SU, int DeltaCycles) { return NoopHazard; } - return getHazardType(MI->getDesc(), DeltaCycles); + return getHazardType(MI->getDesc(), getMemoryBanks(MI), DeltaCycles); } bool AIEHazardRecognizer::conflict(const AIEHazardRecognizer &Other, @@ -363,7 +376,7 @@ void AIEHazardRecognizer::EmitInstruction(SUnit *SU, int DeltaCycles) { // and use the latter to update the scoreboard. unsigned SelectedOpcode = getSelectedAltOpcode(MI).value_or(MI->getOpcode()); if (!AIE::MachineBundle::isNoHazardMetaInstruction(SelectedOpcode)) - emitInScoreboard(TII->get(SelectedOpcode), DeltaCycles); + emitInScoreboard(TII->get(SelectedOpcode), getMemoryBanks(MI), DeltaCycles); // When requested, we switch off VLIW scheduling after the specified number // of instructions are scheduled. @@ -406,23 +419,27 @@ auto toHazardType(bool Conflict) { // recognizing alternatives ScheduleHazardRecognizer::HazardType AIEHazardRecognizer::getHazardType(const MCInstrDesc &Desc, + MemoryBankBits MemoryBanks, const int DeltaCycles) { - return toHazardType(checkConflict( - Scoreboard, ItinData, Desc.getSchedClass(), + return getHazardType( + Desc.getSchedClass(), getSlotSet(Desc, *TII->getFormatInterface(), IgnoreUnknownSlotSets), - DeltaCycles, FUDepthLimit)); + MemoryBanks, DeltaCycles); } ScheduleHazardRecognizer::HazardType AIEHazardRecognizer::getHazardType(unsigned SchedClass, SlotBits SlotSet, + MemoryBankBits MemoryBanks, int DeltaCycles) { - return toHazardType(checkConflict(Scoreboard, ItinData, SchedClass, SlotSet, - DeltaCycles, FUDepthLimit)); + return toHazardType(checkConflict( + Scoreboard, ItinData, SchedClass, SlotSet, MemoryBanks, + TII->getMemoryCycles(SchedClass), DeltaCycles, FUDepthLimit)); } bool AIEHazardRecognizer::checkConflict( const ResourceScoreboard &Scoreboard, const InstrItineraryData *ItinData, unsigned SchedClass, SlotBits SlotSet, + MemoryBankBits MemoryBanks, SmallVector 
MemoryAccessCycles, int DeltaCycles, std::optional FUDepthLimit) { assert(Scoreboard.isValidDelta(DeltaCycles)); @@ -431,6 +448,17 @@ bool AIEHazardRecognizer::checkConflict( if (EmissionCycle.conflict(Scoreboard[DeltaCycles])) return true; + // Verify memory bank hazards + if (!MemoryAccessCycles.empty()) { + FuncUnitWrapper MemoryBankAccessCycle(/*Req=*/0, /*Res=*/0, /*SlotSet=*/0, + MemoryBanks); + for (auto Cycles : MemoryAccessCycles) { + // MemoryAccessCycles starts counting from 1, so we need to subtract 1 + if (MemoryBankAccessCycle.conflict(Scoreboard[DeltaCycles + Cycles - 1])) + return true; + } + } + // Note that Delta will be negative for bottom-up scheduling. // Cycle is 'our' cycle at which each stage of the itinerary starts. // It gets updated by the increment from the InstrStage. @@ -461,25 +489,25 @@ bool AIEHazardRecognizer::checkConflict( } void AIEHazardRecognizer::emitInScoreboard(const MCInstrDesc &Desc, + MemoryBankBits MemoryBanks, int DeltaCycles) { - enterResources( - Scoreboard, ItinData, Desc.getSchedClass(), - getSlotSet(Desc, *TII->getFormatInterface(), IgnoreUnknownSlotSets), - DeltaCycles, FUDepthLimit); + emitInScoreboard(Scoreboard, Desc, MemoryBanks, DeltaCycles); } void AIEHazardRecognizer::emitInScoreboard( ResourceScoreboard &TheScoreboard, const MCInstrDesc &Desc, - int DeltaCycles) const { + MemoryBankBits MemoryBanks, int DeltaCycles) const { enterResources( TheScoreboard, ItinData, Desc.getSchedClass(), getSlotSet(Desc, *TII->getFormatInterface(), IgnoreUnknownSlotSets), - DeltaCycles, FUDepthLimit); + MemoryBanks, TII->getMemoryCycles(Desc.getSchedClass()), DeltaCycles, + FUDepthLimit); } void AIEHazardRecognizer::enterResources( ResourceScoreboard &Scoreboard, const InstrItineraryData *ItinData, unsigned SchedClass, SlotBits SlotSet, + MemoryBankBits MemoryBanks, SmallVector MemoryAccessCycles, int DeltaCycles, std::optional FUDepthLimit) { assert(Scoreboard.isValidDelta(DeltaCycles)); @@ -487,6 +515,15 @@ void 
AIEHazardRecognizer::enterResources( FuncUnitWrapper EmissionCycle(/*Req=*/0, /*Res=*/0, SlotSet); Scoreboard[DeltaCycles] |= EmissionCycle; + // Append memory bank usage + if (!MemoryAccessCycles.empty()) { + FuncUnitWrapper MemoryBankAccessCycle(/*Req=*/0, /*Res=*/0, /*SlotSet=*/0, + MemoryBanks); + for (auto Cycles : MemoryAccessCycles) { + Scoreboard[DeltaCycles + Cycles - 1] |= MemoryBankAccessCycle; + } + } + int Cycle = DeltaCycles; Scoreboard[Cycle].IssueCount++; for (const InstrStage &IS : ItinData->getStages(SchedClass)) { @@ -570,3 +607,20 @@ AIEHazardRecognizer::getSelectedAltOpcode(MachineInstr *MI) const { return It->second; return std::nullopt; } + +MemoryBankBits AIEHazardRecognizer::getMemoryBanks(MachineInstr *MI) const { + if (!(MI->mayLoad() || MI->mayStore())) + return 0; + + if (MI->memoperands_empty()) + return ~0; + + const AIEBaseSubtarget &STI = AIEBaseSubtarget::get(*MI->getMF()); + MemoryBankBits MemoryBankUsed = STI.getDefaultMemoryBank(); + for (auto &MMO : MI->memoperands()) { + MemoryBankBits MemoryBank = + STI.getMemoryBanksFromAddressSpace(MMO->getAddrSpace()); + MemoryBankUsed &= MemoryBank; + } + return MemoryBankUsed; +} diff --git a/llvm/lib/Target/AIE/AIEHazardRecognizer.h b/llvm/lib/Target/AIE/AIEHazardRecognizer.h index 2464531ef771..90ddc92a8121 100644 --- a/llvm/lib/Target/AIE/AIEHazardRecognizer.h +++ b/llvm/lib/Target/AIE/AIEHazardRecognizer.h @@ -44,16 +44,20 @@ class FuncUnitWrapper { /// The occupied slots. This is currently redundant with Bundle SlotBits Slots = 0; + /// The occupied bank + MemoryBankBits MemoryBanks = 0; + public: /// IssueCount - Count instructions issued in this cycle. unsigned IssueCount = 0; - FuncUnitWrapper(const InstrStage &IS, SlotBits Slots = 0) + FuncUnitWrapper(const InstrStage &IS, SlotBits Slots = 0, + MemoryBankBits MemoryBanks = 0) : Required(IS.getReservationKind() == InstrStage::Required ? IS.getUnits() : 0), Reserved(IS.getReservationKind() == InstrStage::Reserved ? 
IS.getUnits() : 0), - Slots(Slots) {} + Slots(Slots), MemoryBanks(MemoryBanks) {} static void setFormatInterface(const AIEBaseMCFormats *Formats); @@ -67,8 +71,8 @@ class FuncUnitWrapper { FuncUnitWrapper() = default; FuncUnitWrapper(InstrStage::FuncUnits Req) : Required(Req), Reserved(0) {} FuncUnitWrapper(InstrStage::FuncUnits Req, InstrStage::FuncUnits Res, - SlotBits Slots = 0) - : Required(Req), Reserved(Res), Slots(Slots) {} + SlotBits Slots = 0, MemoryBankBits MemoryBanks = 0) + : Required(Req), Reserved(Res), Slots(Slots), MemoryBanks(MemoryBanks) {} /// Compare two FuncUnitWrappers for equality. This is only used for /// dumping purposes, quite literally saying "this looks the same" @@ -132,9 +136,11 @@ class AIEHazardRecognizer : public ScheduleHazardRecognizer { /// use from the pre-RA scheduler, where detailed resource modelling /// doesn't pay off. void emitInScoreboard(ResourceScoreboard &Scoreboard, - const MCInstrDesc &Desc, int DeltaCycles) const; + const MCInstrDesc &Desc, MemoryBankBits MemoryBanks, + int DeltaCycles) const; // Apply the above function to the local scoreboard. - void emitInScoreboard(const MCInstrDesc &Desc, int DeltaCycles); + void emitInScoreboard(const MCInstrDesc &Desc, MemoryBankBits MemoryBanks, + int DeltaCycles); /// Block all scoreboard resources at DeltaCycles void blockCycleInScoreboard(int DeltaCycle); @@ -149,6 +155,10 @@ class AIEHazardRecognizer : public ScheduleHazardRecognizer { /// the opcode selected during scheduling. std::optional getSelectedAltOpcode(MachineInstr *MI) const; + /// Return the bit mask of memory banks that \p MI may access; the mask is + /// 0 when MI neither loads nor stores. + MemoryBankBits getMemoryBanks(MachineInstr *MI) const; + /// The pipeline depth is the depth of the deepest instruction. /// We compute that once from the itineraries. 
unsigned getPipelineDepth() const; @@ -169,19 +179,25 @@ class AIEHazardRecognizer : public ScheduleHazardRecognizer { protected: ScheduleHazardRecognizer::HazardType getHazardType(const MCInstrDesc &Desc, + MemoryBankBits MemoryBanks, + int DeltaCycles); + ScheduleHazardRecognizer::HazardType getHazardType(unsigned SchedClass, + SlotBits SlotSet, + MemoryBankBits MemoryBanks, int DeltaCycles); - ScheduleHazardRecognizer::HazardType - getHazardType(unsigned SchedClass, SlotBits SlotSet, int DeltaCycles); static bool checkConflict(const ResourceScoreboard &Scoreboard, const InstrItineraryData *ItinData, unsigned SchedClass, - SlotBits SlotSet, int DeltaCycles, + SlotBits SlotSet, MemoryBankBits MemoryBanks, + SmallVector MemoryAccessCycles, int DeltaCycles, std::optional FUDepthLimit); static void enterResources(ResourceScoreboard &Scoreboard, const InstrItineraryData *ItinData, unsigned SchedClass, SlotBits SlotSet, + MemoryBankBits MemoryBanks, + SmallVector MemoryAccessCycles, int DeltaCycles, std::optional FUDepthLimit); private: diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp index 98d01cc9ec64..b56c73ddc9cd 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp @@ -59,7 +59,8 @@ void emitBundlesInScoreboard(const std::vector &Bundles, // then this will not cause conflicts. 
for (int i = TotalBundles - AmountToEmit; i < TotalBundles; i++) { for (MachineInstr *MI : Bundles[i].getInstrs()) - HR->emitInScoreboard(Scoreboard, MI->getDesc(), 0); + HR->emitInScoreboard(Scoreboard, MI->getDesc(), HR->getMemoryBanks(MI), + 0); Scoreboard.advance(); } @@ -77,7 +78,8 @@ void emitBundlesInScoreboardDelta( break; for (MachineInstr *MI : Bundle.getInstrs()) - HR->emitInScoreboard(Scoreboard, MI->getDesc(), Delta); + HR->emitInScoreboard(Scoreboard, MI->getDesc(), HR->getMemoryBanks(MI), + Delta); Delta++; } diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp index 649d115b5779..ada17f7c66c8 100644 --- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp +++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp @@ -259,7 +259,8 @@ void AIEPostRASchedStrategy::initializeBotScoreBoard(ScoreboardTrust Trust) { /// make sure we always have enough lookahead available. We arrange for that /// by starting in the earliest possible cycle, -Depth auto InsertInCycle = [=](MachineInstr &MI, int Cycle) { - BotHazardRec->emitInScoreboard(MI.getDesc(), Cycle - Depth); + BotHazardRec->emitInScoreboard( + MI.getDesc(), BotHazardRec->getMemoryBanks(&MI), Cycle - Depth); }; auto BlockCycle = [=](int Cycle) { BotHazardRec->blockCycleInScoreboard(Cycle - Depth); diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll index fc78d7e13404..8747a1d676bb 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll @@ -224,7 +224,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-LABEL: conv2d.loop.nest: ; DCL: .p2align 4 ; DCL-NEXT: // %bb.0: // %newFuncRoot -; DCL-NEXT: mova dj3, #0; nopx +; DCL-NEXT: mova dj3, #0 ; DCL-NEXT: mov s0, r0 ; DCL-NEXT: mov s1, r1 ; DCL-NEXT: mov s2, r6 @@ -232,35 +232,35 @@ define dso_local void @conv2d.loop.nest(ptr 
%add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: mov dc4, dj3 ; DCL-NEXT: mov dc1, dj3 ; DCL-NEXT: mov dc5, dj3 -; DCL-NEXT: mov dc2, dj3 -; DCL-NEXT: paddb [sp], #192; mov dc6, dj3 +; DCL-NEXT: paddb [sp], #192; mov dc2, dj3 ; DCL-NEXT: st p7, [sp, #-192] // 4-byte Folded Spill ; DCL-NEXT: mov p7, sp ; DCL-NEXT: paddb [p7], #-272; st p6, [sp, #-188] // 4-byte Folded Spill -; DCL-NEXT: lda r25, [p7, #0]; mov p6, sp +; DCL-NEXT: lda r25, [p7, #0]; mov dc6, dj3 +; DCL-NEXT: mov p6, sp ; DCL-NEXT: paddb [p6], #-292; mov dc3, dj3 ; DCL-NEXT: lda m0, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-296; mov r28, dj3 ; DCL-NEXT: lda dj0, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-300; mov dc7, dj3 -; DCL-NEXT: lda dn0, [p6, #0] -; DCL-NEXT: mov p6, sp -; DCL-NEXT: paddb [p6], #-204 +; DCL-NEXT: paddb [p6], #-300 +; DCL-NEXT: lda dn0, [p6, #0]; mov p7, sp +; DCL-NEXT: paddb [p7], #-200; mov p6, sp +; DCL-NEXT: lda m6, [p7, #0]; paddb [p6], #-204 ; DCL-NEXT: lda m0, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-208 ; DCL-NEXT: lda dj0, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-212 ; DCL-NEXT: lda dj4, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-216; mov p7, sp -; DCL-NEXT: lda dn0, [p6, #0]; st m0, [sp, #-96] // 4-byte Folded Spill -; DCL-NEXT: paddb [p7], #-200; mov p6, sp -; DCL-NEXT: lda m6, [p7, #0]; paddb [p6], #-220; st dj0, [sp, #-88] // 4-byte Folded Spill +; DCL-NEXT: paddb [p6], #-216; st m0, [sp, #-96] // 4-byte Folded Spill +; DCL-NEXT: lda dn0, [p6, #0] +; DCL-NEXT: mov p6, sp +; DCL-NEXT: paddb [p6], #-220; st dj0, [sp, #-88] // 4-byte Folded Spill ; DCL-NEXT: lda dn4, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-228 ; DCL-NEXT: lda r11, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-232; st dn0, [sp, #-92] // 4-byte Folded Spill ; DCL-NEXT: lda dj1, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-236 +; DCL-NEXT: paddb [p6], #-236; mov dc7, dj3 ; DCL-NEXT: lda r12, [p6, #0] ; DCL-NEXT: mov p6, sp ; DCL-NEXT: paddb [p6], #-240 @@ -280,18 
+280,18 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: lda dn6, [p6, #0]; mov p6, sp ; DCL-NEXT: paddb [p6], #-268 ; DCL-NEXT: lda r14, [p6, #0] -; DCL-NEXT: mov p6, sp +; DCL-NEXT: vst wl0, [sp, #-64]; mov p6, sp // 32-byte Folded Spill ; DCL-NEXT: lda dj7, [sp, #-88]; paddb [p6], #-276 // 4-byte Folded Reload ; DCL-NEXT: lda dn3, [p6, #0]; mov p6, sp -; DCL-NEXT: paddb [p6], #-280 +; DCL-NEXT: paddb [p6], #-280; st dc7, [sp, #-84] // 4-byte Folded Spill ; DCL-NEXT: lda r26, [p6, #0]; mov p6, sp ; DCL-NEXT: lda dn7, [sp, #-92]; paddb [p6], #-196 // 4-byte Folded Reload ; DCL-NEXT: lda r15, [p6, #0]; paddb [p7], #-288; mov p6, sp ; DCL-NEXT: lda r27, [p7, #0]; paddb [p6], #-224 -; DCL-NEXT: lda r24, [p6, #0]; vst wl0, [sp, #-64] // 32-byte Folded Spill +; DCL-NEXT: lda r24, [p6, #0] ; DCL-NEXT: vst wh0, [sp, #-32]; mov p6, sp // 32-byte Folded Spill -; DCL-NEXT: paddb [p6], #-284; st dc7, [sp, #-84] // 4-byte Folded Spill -; DCL-NEXT: lda m4, [p6, #0]; st m7, [sp, #-96] // 4-byte Folded Spill +; DCL-NEXT: paddb [p6], #-284; st m7, [sp, #-96] // 4-byte Folded Spill +; DCL-NEXT: lda m4, [p6, #0] ; DCL-NEXT: st dj7, [sp, #-88]; movx r8, #11 // 4-byte Folded Spill ; DCL-NEXT: st dn7, [sp, #-92]; movx r9, #31 // 4-byte Folded Spill ; DCL-NEXT: // implicit-def: $x4 @@ -347,19 +347,20 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: vlda wl10, [p1], #32; add r0, r10, #33; mov r10, p0; vmac cm3, cm3, x11, x6, r4 // Delay Slot 2 ; DCL-NEXT: vlda wh10, [p1], #32; and r10, r10, r9; vmov x8, x10; vmac cm7, cm7, x11, x8, r4 // Delay Slot 1 ; DCL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1 -; DCL-NEXT: nopa ; nopb ; nopx ; vmov x11, x0 +; DCL-NEXT: nopa ; nopx ; vmov x11, x0 ; DCL-NEXT: vshuffle x0, x4, x2, r3 ; DCL-NEXT: vshuffle x11, x0, x11, r8 -; DCL-NEXT: nop -; DCL-NEXT: vlda wl0, [sp, #-64]; vst wl11, [sp, #-160] // 32-byte Folded Reload32-byte Folded Spill -; DCL-NEXT: 
vlda wl11, [sp, #-160]; vst wh11, [sp, #-128] // 32-byte Folded Reload32-byte Folded Spill +; DCL-NEXT: vlda wl0, [sp, #-64] // 32-byte Folded Reload +; DCL-NEXT: vst wl11, [sp, #-160] // 32-byte Folded Spill +; DCL-NEXT: vst wh11, [sp, #-128] // 32-byte Folded Spill +; DCL-NEXT: vlda wl11, [sp, #-160] // 32-byte Folded Reload ; DCL-NEXT: vlda wh11, [sp, #-128] // 32-byte Folded Reload -; DCL-NEXT: vlda wl6, [sp, #-160]; vmac cm0, cm0, x7, x6, r4 // 32-byte Folded Reload -; DCL-NEXT: vlda wh6, [sp, #-128]; vmac cm1, cm1, x9, x6, r4 // 32-byte Folded Reload -; DCL-NEXT: vlda wh0, [sp, #-32]; vmac cm2, cm2, x0, x6, r4 // 32-byte Folded Reload -; DCL-NEXT: lda dn7, [sp, #-92]; vmac cm5, cm6, x0, x8, r4 // 4-byte Folded Reload -; DCL-NEXT: vmac cm4, cm5, x9, x8, r4 -; DCL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x5, r0; vmac cm8, cm4, x7, x8, r4 // 4-byte Folded Reload +; DCL-NEXT: vlda wl6, [sp, #-160]; vmac cm2, cm2, x0, x6, r4 // 32-byte Folded Reload +; DCL-NEXT: vlda wh6, [sp, #-128]; vmac cm5, cm6, x0, x8, r4 // 32-byte Folded Reload +; DCL-NEXT: vlda wh0, [sp, #-32]; vmac cm4, cm5, x9, x8, r4 // 32-byte Folded Reload +; DCL-NEXT: lda dn7, [sp, #-92]; vmac cm8, cm4, x7, x8, r4 // 4-byte Folded Reload +; DCL-NEXT: vmac cm0, cm0, x7, x6, r4 +; DCL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x5, r0; vmac cm1, cm1, x9, x6, r4 // 4-byte Folded Reload ; DCL-NEXT: vshift.align x2, x2, s1, x3, r0; vmac cm3, cm3, x11, x6, r4 ; DCL-NEXT: vshuffle x6, x4, x2, r2 ; DCL-NEXT: vmac cm6, cm7, x6, x8, r4 @@ -368,19 +369,19 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: vshuffle x3, x4, x2, r3; vmac cm0, cm1, x8, x1, r4 ; DCL-NEXT: st dj7, [sp, #-88] // 4-byte Folded Spill ; DCL-NEXT: vshuffle x5, x3, x0, r8; vmac cm1, cm2, x3, x1, r4 -; DCL-NEXT: vst.srs.s16.s32 bmh7, s2, [p3, #32]; mov s3, r6 -; DCL-NEXT: vst.srs.s16.s32 bml7, s3, [p3], #64; vmac cm2, cm3, x5, x1, r4 +; DCL-NEXT: lda m7, [sp, #-96]; 
vst.srs.s16.s32 bmh7, s2, [p3, #32]; mov s3, r6 // 4-byte Folded Reload +; DCL-NEXT: lda dc7, [sp, #-84]; vst.srs.s16.s32 bml7, s3, [p3], #64; vmac cm2, cm3, x5, x1, r4 // 4-byte Folded Reload ; DCL-NEXT: vst.srs.s16.s32 bmh0, s3, [p3, #32] ; DCL-NEXT: vst.srs.s16.s32 bml0, s3, [p3], m4; vmac cm3, cm8, x6, x10, r4 ; DCL-NEXT: vst.srs.s16.s32 bmh1, s3, [p3, #32] -; DCL-NEXT: lda m7, [sp, #-96]; vst.srs.s16.s32 bml1, s3, [p3], #64; vmac cm8, cm4, x8, x10, r4 // 4-byte Folded Reload -; DCL-NEXT: lda dc7, [sp, #-84]; vst.srs.s16.s32 bmh2, s3, [p3, #32]; mov m1, r27 // 4-byte Folded Reload +; DCL-NEXT: vst.srs.s16.s32 bml1, s3, [p3], #64; mov m1, r27; vmac cm8, cm4, x8, x10, r4 +; DCL-NEXT: vst.srs.s16.s32 bmh2, s3, [p3, #32] ; DCL-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m1; vmac cm5, cm5, x3, x10, r4 -; DCL-NEXT: vst.srs.s16.s32 bmh3, s3, [p3, #32]; mov dj5, r12 -; DCL-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64; mov m2, r13; vmac cm4, cm6, x5, x10, r4 -; DCL-NEXT: vst.srs.s16.s32 bmh8, s3, [p3, #32]; mov m3, r14 -; DCL-NEXT: vst.srs.s16.s32 bml8, s3, [p3], m4 -; DCL-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32] +; DCL-NEXT: vst.srs.s16.s32 bmh3, s3, [p3, #32] +; DCL-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64; vmac cm4, cm6, x5, x10, r4 +; DCL-NEXT: vst.srs.s16.s32 bmh8, s3, [p3, #32]; mov dj5, r12 +; DCL-NEXT: vst.srs.s16.s32 bml8, s3, [p3], m4; mov m2, r13 +; DCL-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32]; mov m3, r14 ; DCL-NEXT: vst.srs.s16.s32 bml5, s3, [p3], #64; mov m1, r11 ; DCL-NEXT: padda.3d [p0], d1; vst.srs.s16.s32 bmh4, s3, [p3, #32]; mov m1, r24 ; DCL-NEXT: padda.2d [p3], d7; vst.srs.s16.s32 bml4, s3, [p3, #0]; add r7, r7, #-1; mov dj7, r25 @@ -403,7 +404,7 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-LABEL: conv2d.loop.nest: ; ZOL: .p2align 4 ; ZOL-NEXT: // %bb.0: // %newFuncRoot -; ZOL-NEXT: mova dj3, #0; nopx +; ZOL-NEXT: mova dj3, #0 ; ZOL-NEXT: mov s0, r0 ; ZOL-NEXT: mov s1, r1 ; ZOL-NEXT: mov s2, r6 @@ -411,35 
+412,35 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: mov dc4, dj3 ; ZOL-NEXT: mov dc1, dj3 ; ZOL-NEXT: mov dc5, dj3 -; ZOL-NEXT: mov dc2, dj3 -; ZOL-NEXT: paddb [sp], #192; mov dc6, dj3 +; ZOL-NEXT: paddb [sp], #192; mov dc2, dj3 ; ZOL-NEXT: st p7, [sp, #-192] // 4-byte Folded Spill ; ZOL-NEXT: mov p7, sp ; ZOL-NEXT: paddb [p7], #-272; st p6, [sp, #-188] // 4-byte Folded Spill -; ZOL-NEXT: lda r24, [p7, #0]; mov p6, sp +; ZOL-NEXT: lda r24, [p7, #0]; mov dc6, dj3 +; ZOL-NEXT: mov p6, sp ; ZOL-NEXT: paddb [p6], #-292; mov dc3, dj3 ; ZOL-NEXT: lda m0, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-296; mov r27, dj3 ; ZOL-NEXT: lda dj0, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-300; mov dc7, dj3 -; ZOL-NEXT: lda dn0, [p6, #0] -; ZOL-NEXT: mov p6, sp -; ZOL-NEXT: paddb [p6], #-204 +; ZOL-NEXT: paddb [p6], #-300 +; ZOL-NEXT: lda dn0, [p6, #0]; mov p7, sp +; ZOL-NEXT: paddb [p7], #-200; mov p6, sp +; ZOL-NEXT: lda m6, [p7, #0]; paddb [p6], #-204 ; ZOL-NEXT: lda m0, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-208 ; ZOL-NEXT: lda dj0, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-212 ; ZOL-NEXT: lda dj4, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-216; mov p7, sp -; ZOL-NEXT: lda dn0, [p6, #0]; st m0, [sp, #-96] // 4-byte Folded Spill -; ZOL-NEXT: paddb [p7], #-200; mov p6, sp -; ZOL-NEXT: lda m6, [p7, #0]; paddb [p6], #-220; st dj0, [sp, #-88] // 4-byte Folded Spill +; ZOL-NEXT: paddb [p6], #-216; st m0, [sp, #-96] // 4-byte Folded Spill +; ZOL-NEXT: lda dn0, [p6, #0] +; ZOL-NEXT: mov p6, sp +; ZOL-NEXT: paddb [p6], #-220; st dj0, [sp, #-88] // 4-byte Folded Spill ; ZOL-NEXT: lda dn4, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-228 ; ZOL-NEXT: lda r10, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-232; st dn0, [sp, #-92] // 4-byte Folded Spill ; ZOL-NEXT: lda dj1, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-236 +; ZOL-NEXT: paddb [p6], #-236; mov dc7, dj3 ; ZOL-NEXT: lda r11, [p6, #0] ; ZOL-NEXT: mov 
p6, sp ; ZOL-NEXT: paddb [p6], #-240 @@ -459,18 +460,18 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: lda dn6, [p6, #0]; mov p6, sp ; ZOL-NEXT: paddb [p6], #-268 ; ZOL-NEXT: lda r13, [p6, #0] -; ZOL-NEXT: mov p6, sp +; ZOL-NEXT: vst wl0, [sp, #-64]; mov p6, sp // 32-byte Folded Spill ; ZOL-NEXT: lda dj7, [sp, #-88]; paddb [p6], #-276 // 4-byte Folded Reload ; ZOL-NEXT: lda dn3, [p6, #0]; mov p6, sp -; ZOL-NEXT: paddb [p6], #-280 +; ZOL-NEXT: paddb [p6], #-280; st dc7, [sp, #-84] // 4-byte Folded Spill ; ZOL-NEXT: lda r25, [p6, #0]; mov p6, sp ; ZOL-NEXT: lda dn7, [sp, #-92]; paddb [p6], #-196 // 4-byte Folded Reload ; ZOL-NEXT: lda r14, [p6, #0]; paddb [p7], #-288; mov p6, sp ; ZOL-NEXT: lda r26, [p7, #0]; paddb [p6], #-224 -; ZOL-NEXT: lda r15, [p6, #0]; vst wl0, [sp, #-64] // 32-byte Folded Spill +; ZOL-NEXT: lda r15, [p6, #0] ; ZOL-NEXT: vst wh0, [sp, #-32]; mov p6, sp // 32-byte Folded Spill -; ZOL-NEXT: paddb [p6], #-284; st dc7, [sp, #-84] // 4-byte Folded Spill -; ZOL-NEXT: lda m4, [p6, #0]; st m7, [sp, #-96] // 4-byte Folded Spill +; ZOL-NEXT: paddb [p6], #-284; st m7, [sp, #-96] // 4-byte Folded Spill +; ZOL-NEXT: lda m4, [p6, #0] ; ZOL-NEXT: st dj7, [sp, #-88]; movx r8, #11 // 4-byte Folded Spill ; ZOL-NEXT: st dn7, [sp, #-92]; movx r9, #31 // 4-byte Folded Spill ; ZOL-NEXT: // implicit-def: $x4 @@ -527,19 +528,20 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: .L_LEnd0: ; ZOL-NEXT: nopb ; vlda wh10, [p1], #32; nops ; and r1, r1, r9; vmov x8, x10; vmac cm7, cm7, x11, x8, r4 ; ZOL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1 -; ZOL-NEXT: nopa ; nopb ; nopx ; vmov x11, x0 +; ZOL-NEXT: nopa ; nopx ; vmov x11, x0 ; ZOL-NEXT: vshuffle x0, x4, x2, r3 ; ZOL-NEXT: vshuffle x11, x0, x11, r8 -; ZOL-NEXT: nop -; ZOL-NEXT: vlda wl0, [sp, #-64]; vst wl11, [sp, #-160] // 32-byte Folded Reload32-byte Folded Spill -; ZOL-NEXT: vlda wl11, [sp, #-160]; vst wh11, [sp, 
#-128] // 32-byte Folded Reload32-byte Folded Spill +; ZOL-NEXT: vlda wl0, [sp, #-64] // 32-byte Folded Reload +; ZOL-NEXT: vst wl11, [sp, #-160] // 32-byte Folded Spill +; ZOL-NEXT: vst wh11, [sp, #-128] // 32-byte Folded Spill +; ZOL-NEXT: vlda wl11, [sp, #-160] // 32-byte Folded Reload ; ZOL-NEXT: vlda wh11, [sp, #-128] // 32-byte Folded Reload -; ZOL-NEXT: vlda wl6, [sp, #-160]; vmac cm0, cm0, x7, x6, r4 // 32-byte Folded Reload -; ZOL-NEXT: vlda wh6, [sp, #-128]; vmac cm1, cm1, x9, x6, r4 // 32-byte Folded Reload -; ZOL-NEXT: vlda wh0, [sp, #-32]; vmac cm2, cm2, x0, x6, r4 // 32-byte Folded Reload -; ZOL-NEXT: lda dn7, [sp, #-92]; vmac cm5, cm6, x0, x8, r4 // 4-byte Folded Reload -; ZOL-NEXT: vmac cm4, cm5, x9, x8, r4 -; ZOL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x5, r0; vmac cm8, cm4, x7, x8, r4 // 4-byte Folded Reload +; ZOL-NEXT: vlda wl6, [sp, #-160]; vmac cm2, cm2, x0, x6, r4 // 32-byte Folded Reload +; ZOL-NEXT: vlda wh6, [sp, #-128]; vmac cm5, cm6, x0, x8, r4 // 32-byte Folded Reload +; ZOL-NEXT: vlda wh0, [sp, #-32]; vmac cm4, cm5, x9, x8, r4 // 32-byte Folded Reload +; ZOL-NEXT: lda dn7, [sp, #-92]; vmac cm8, cm4, x7, x8, r4 // 4-byte Folded Reload +; ZOL-NEXT: vmac cm0, cm0, x7, x6, r4 +; ZOL-NEXT: lda dj7, [sp, #-88]; vshift.align x4, x4, s1, x5, r0; vmac cm1, cm1, x9, x6, r4 // 4-byte Folded Reload ; ZOL-NEXT: vshift.align x2, x2, s1, x3, r0; vmac cm3, cm3, x11, x6, r4 ; ZOL-NEXT: vshuffle x6, x4, x2, r2 ; ZOL-NEXT: vmac cm6, cm7, x6, x8, r4 @@ -548,19 +550,19 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: vshuffle x3, x4, x2, r3; vmac cm0, cm1, x8, x1, r4 ; ZOL-NEXT: st dj7, [sp, #-88] // 4-byte Folded Spill ; ZOL-NEXT: vshuffle x5, x3, x0, r8; vmac cm1, cm2, x3, x1, r4 -; ZOL-NEXT: vst.srs.s16.s32 bmh7, s2, [p3, #32]; mov s3, r6 -; ZOL-NEXT: vst.srs.s16.s32 bml7, s3, [p3], #64; vmac cm2, cm3, x5, x1, r4 +; ZOL-NEXT: lda m7, [sp, #-96]; vst.srs.s16.s32 bmh7, s2, [p3, #32]; mov s3, r6 // 
4-byte Folded Reload +; ZOL-NEXT: lda dc7, [sp, #-84]; vst.srs.s16.s32 bml7, s3, [p3], #64; vmac cm2, cm3, x5, x1, r4 // 4-byte Folded Reload ; ZOL-NEXT: vst.srs.s16.s32 bmh0, s3, [p3, #32] ; ZOL-NEXT: vst.srs.s16.s32 bml0, s3, [p3], m4; vmac cm3, cm8, x6, x10, r4 ; ZOL-NEXT: vst.srs.s16.s32 bmh1, s3, [p3, #32] -; ZOL-NEXT: lda m7, [sp, #-96]; vst.srs.s16.s32 bml1, s3, [p3], #64; vmac cm8, cm4, x8, x10, r4 // 4-byte Folded Reload -; ZOL-NEXT: lda dc7, [sp, #-84]; vst.srs.s16.s32 bmh2, s3, [p3, #32]; mov m1, r26 // 4-byte Folded Reload +; ZOL-NEXT: vst.srs.s16.s32 bml1, s3, [p3], #64; mov m1, r26; vmac cm8, cm4, x8, x10, r4 +; ZOL-NEXT: vst.srs.s16.s32 bmh2, s3, [p3, #32] ; ZOL-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m1; vmac cm5, cm5, x3, x10, r4 -; ZOL-NEXT: vst.srs.s16.s32 bmh3, s3, [p3, #32]; mov dj5, r11 -; ZOL-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64; mov m2, r12; vmac cm4, cm6, x5, x10, r4 -; ZOL-NEXT: vst.srs.s16.s32 bmh8, s3, [p3, #32]; mov m3, r13 -; ZOL-NEXT: vst.srs.s16.s32 bml8, s3, [p3], m4 -; ZOL-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32] +; ZOL-NEXT: vst.srs.s16.s32 bmh3, s3, [p3, #32] +; ZOL-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64; vmac cm4, cm6, x5, x10, r4 +; ZOL-NEXT: vst.srs.s16.s32 bmh8, s3, [p3, #32]; mov dj5, r11 +; ZOL-NEXT: vst.srs.s16.s32 bml8, s3, [p3], m4; mov m2, r12 +; ZOL-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32]; mov m3, r13 ; ZOL-NEXT: vst.srs.s16.s32 bml5, s3, [p3], #64; mov m1, r10 ; ZOL-NEXT: padda.3d [p0], d1; vst.srs.s16.s32 bmh4, s3, [p3, #32]; mov m1, r15 ; ZOL-NEXT: padda.2d [p3], d7; vst.srs.s16.s32 bml4, s3, [p3, #0]; add r7, r7, #-1; mov dj7, r24 diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll index a292a15d449f..8bbe2b744c83 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll @@ -18,21 +18,21 @@ declare <32 x i8> @llvm.aie2.I256.v32.acc32.srs(<16 x i64>, i32, i32) declare <16 x i64> 
@llvm.aie2.v32acc32() ; AA-LABEL: Function: mul2d -; AA: NoAlias: <8 x i32>* %in_ptr0.addr.058, <8 x i32>* %in_ptr1.addr.057 -; AA: NoAlias: <8 x i32>* %in_ptr0.addr.058, <32 x i8>* %out_ptr.addr.056 -; AA: NoAlias: <8 x i32>* %in_ptr1.addr.057, <32 x i8>* %out_ptr.addr.056 -; AA: MayAlias: <8 x i32>* %13, <8 x i32>* %in_ptr0.addr.058 -; AA: NoAlias: <8 x i32>* %13, <8 x i32>* %in_ptr1.addr.057 -; AA: NoAlias: <8 x i32>* %13, <32 x i8>* %out_ptr.addr.056 -; AA: NoAlias: <8 x i32>* %add.ptr.i, <8 x i32>* %in_ptr0.addr.058 -; AA: NoAlias: <8 x i32>* %add.ptr.i, <8 x i32>* %in_ptr1.addr.057 -; AA: NoAlias: <8 x i32>* %add.ptr.i, <32 x i8>* %out_ptr.addr.056 -; AA: NoAlias: <8 x i32>* %13, <8 x i32>* %add.ptr.i -; AA: NoAlias: <32 x i8>* %add.ptr, <8 x i32>* %in_ptr0.addr.058 -; AA: NoAlias: <32 x i8>* %add.ptr, <8 x i32>* %in_ptr1.addr.057 -; AA: NoAlias: <32 x i8>* %add.ptr, <32 x i8>* %out_ptr.addr.056 -; AA: NoAlias: <8 x i32>* %13, <32 x i8>* %add.ptr -; AA: NoAlias: <32 x i8>* %add.ptr, <8 x i32>* %add.ptr.i +; AA: NoAlias: <8 x i32> addrspace(5)* %in_ptr0.addr.058.ascast, <8 x i32> addrspace(5)* %in_ptr1.addr.057.ascast +; AA: NoAlias: <8 x i32> addrspace(5)* %in_ptr0.addr.058.ascast, <32 x i8> addrspace(6)* %out_ptr.addr.056.ascast +; AA: NoAlias: <8 x i32> addrspace(5)* %in_ptr1.addr.057.ascast, <32 x i8> addrspace(6)* %out_ptr.addr.056.ascast +; AA: MayAlias: <8 x i32> addrspace(5)* %ascast.13, <8 x i32> addrspace(5)* %in_ptr0.addr.058.ascast +; AA: NoAlias: <8 x i32> addrspace(5)* %ascast.13, <8 x i32> addrspace(5)* %in_ptr1.addr.057.ascast +; AA: NoAlias: <8 x i32> addrspace(5)* %ascast.13, <32 x i8> addrspace(6)* %out_ptr.addr.056.ascast +; AA: NoAlias: <8 x i32> addrspace(5)* %ascast.add.ptr.i, <8 x i32> addrspace(5)* %in_ptr0.addr.058.ascast +; AA: NoAlias: <8 x i32> addrspace(5)* %ascast.add.ptr.i, <8 x i32> addrspace(5)* %in_ptr1.addr.057.ascast +; AA: NoAlias: <8 x i32> addrspace(5)* %ascast.add.ptr.i, <32 x i8> addrspace(6)* %out_ptr.addr.056.ascast 
+; AA: NoAlias: <8 x i32> addrspace(5)* %ascast.13, <8 x i32> addrspace(5)* %ascast.add.ptr.i +; AA: NoAlias: <32 x i8> addrspace(6)* %add.ptr.ascast, <8 x i32> addrspace(5)* %in_ptr0.addr.058.ascast +; AA: NoAlias: <32 x i8> addrspace(6)* %add.ptr.ascast, <8 x i32> addrspace(5)* %in_ptr1.addr.057.ascast +; AA: NoAlias: <32 x i8> addrspace(6)* %add.ptr.ascast, <32 x i8> addrspace(6)* %out_ptr.addr.056.ascast +; AA: NoAlias: <32 x i8> addrspace(6)* %add.ptr.ascast, <8 x i32> addrspace(5)* %ascast.13 +; AA: NoAlias: <32 x i8> addrspace(6)* %add.ptr.ascast, <8 x i32> addrspace(5)* %ascast.add.ptr.i ; Two vectors are loaded and fed to a vmul: one through a vlda.postinc, the ; other through a vlda.3d. @@ -140,14 +140,16 @@ define void @mul2d(ptr noalias %in_ptr0, ptr noalias %in_ptr1, ptr noalias %out_ %out_ptr.addr.056 = phi ptr [ %out_ptr, %for.body.lr.ph ], [ %add.ptr21, %for.body ] %itr_left_cnt0.055 = phi i32 [ 0, %for.body.lr.ph ], [ %29, %for.body ] %itr_left_cnt1.054 = phi i32 [ 0, %for.body.lr.ph ], [ %31, %for.body ] - %7 = load <8 x i32>, ptr %in_ptr0.addr.058, align 32 + %in_ptr0.addr.058.ascast = addrspacecast ptr %in_ptr0.addr.058 to ptr addrspace(5) + %7 = load <8 x i32>, ptr addrspace(5) %in_ptr0.addr.058.ascast, align 32 %8 = trunc i32 %itr_left_cnt0.055 to i20 %9 = trunc i32 %itr_left_cnt1.054 to i20 %10 = tail call { ptr, i20, i20 } @llvm.aie2.add.3d(ptr nonnull %in_ptr0.addr.058, i20 %2, i20 %3, i20 %4, i20 %5, i20 %8, i20 %6, i20 %9) %11 = extractvalue { ptr, i20, i20 } %10, 1 %12 = extractvalue { ptr, i20, i20 } %10, 2 %13 = extractvalue { ptr, i20, i20 } %10, 0 - %14 = load <8 x i32>, ptr %in_ptr1.addr.057, align 32 + %in_ptr1.addr.057.ascast = addrspacecast ptr %in_ptr1.addr.057 to ptr addrspace(5) + %14 = load <8 x i32>, ptr addrspace(5) %in_ptr1.addr.057.ascast, align 32 %add.ptr.i = getelementptr inbounds i8, ptr %in_ptr1.addr.057, i20 32 %15 = tail call <16 x i64> @llvm.aie2.v32acc32() %16 = tail call <64 x i8> @llvm.aie2.v64int8() @@ -160,16 
+162,19 @@ define void @mul2d(ptr noalias %in_ptr0, ptr noalias %in_ptr1, ptr noalias %out_ %23 = bitcast <16 x i32> %21 to <64 x i8> %24 = tail call <16 x i64> @llvm.aie2.I512.I512.acc32.mul.conf(<64 x i8> %23, <16 x i32> %22, i32 808) %25 = tail call <32 x i8> @llvm.aie2.I256.v32.acc32.srs(<16 x i64> %24, i32 %conv5, i32 %conv.i.i.i) - store <32 x i8> %25, ptr %out_ptr.addr.056, align 32 + %out_ptr.addr.056.ascast = addrspacecast ptr %out_ptr.addr.056 to ptr addrspace(6) + store <32 x i8> %25, ptr addrspace(6) %out_ptr.addr.056.ascast , align 32 %add.ptr = getelementptr inbounds i8, ptr %out_ptr.addr.056, i20 32 - %26 = load <8 x i32>, ptr %13, align 32 + %ascast.13 = addrspacecast ptr %13 to ptr addrspace(5) + %26 = load <8 x i32>, ptr addrspace(5) %ascast.13, align 32 %27 = tail call { ptr, i20, i20 } @llvm.aie2.add.3d(ptr nonnull %13, i20 %2, i20 %3, i20 %4, i20 %5, i20 %11, i20 %6, i20 %12) %28 = extractvalue { ptr, i20, i20 } %27, 1 %29 = zext i20 %28 to i32 %30 = extractvalue { ptr, i20, i20 } %27, 2 %31 = zext i20 %30 to i32 %32 = extractvalue { ptr, i20, i20 } %27, 0 - %33 = load <8 x i32>, ptr %add.ptr.i, align 32 + %ascast.add.ptr.i = addrspacecast ptr %add.ptr.i to ptr addrspace(5) + %33 = load <8 x i32>, ptr addrspace(5) %ascast.add.ptr.i, align 32 %add.ptr.i39 = getelementptr inbounds i8, ptr %in_ptr1.addr.057, i20 64 %34 = tail call <16 x i32> @llvm.aie2.set.I512.I256(<8 x i32> %26, i32 0) %35 = tail call <16 x i32> @llvm.aie2.upd.I512.I256(<16 x i32> %34, <8 x i32> %19, i32 1) @@ -177,7 +182,8 @@ define void @mul2d(ptr noalias %in_ptr0, ptr noalias %in_ptr1, ptr noalias %out_ %37 = bitcast <16 x i32> %35 to <64 x i8> %38 = tail call <16 x i64> @llvm.aie2.I512.I512.acc32.mul.conf(<64 x i8> %37, <16 x i32> %36, i32 808) %39 = tail call <32 x i8> @llvm.aie2.I256.v32.acc32.srs(<16 x i64> %38, i32 %conv5, i32 %conv.i.i.i) - store <32 x i8> %39, ptr %add.ptr, align 32 + %add.ptr.ascast = addrspacecast ptr %add.ptr to ptr addrspace(6) + store <32 x i8> 
%39, ptr addrspace(6) %add.ptr.ascast, align 32 %add.ptr21 = getelementptr inbounds i8, ptr %out_ptr.addr.056, i20 64 %lsr.iv.next = add nsw i32 %lsr.iv, -1 %exitcond.not = icmp eq i32 %lsr.iv.next, 0 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/implicit_ops.mir b/llvm/test/CodeGen/AIE/aie2/schedule/implicit_ops.mir index 0628054d8f0c..4c8d23622dd9 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/implicit_ops.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/implicit_ops.mir @@ -58,8 +58,8 @@ body: | bb.0.entry: ; CHECK-LABEL: name: implicit_WAW ; CHECK: BUNDLE implicit-def $wl0, implicit-def $x0, implicit-def $wh0, implicit killed $p0, implicit killed $p1 { - ; CHECK-NEXT: $wl0 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0, implicit-def $x0 :: (load (<8 x s32>) from stack - 64) - ; CHECK-NEXT: $wh0 = VLDB_dmw_ldb_ag_idx_imm killed $p1, 0 :: (load (<8 x s32>) from stack - 32) + ; CHECK-NEXT: $wl0 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0, implicit-def $x0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: $wh0 = VLDB_dmw_ldb_ag_idx_imm killed $p1, 0 :: (load (<8 x s32>), addrspace 6) ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP @@ -67,6 +67,6 @@ body: | ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP - $wl0 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0, implicit-def $x0 :: (load (<8 x s32>) from stack - 64) - $wh0 = VLDB_dmw_ldb_ag_idx_imm $p1, 0 :: (load (<8 x s32>) from stack - 32) + $wl0 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0, implicit-def $x0 :: (load (<8 x s32>), addrspace 5) + $wh0 = VLDB_dmw_ldb_ag_idx_imm $p1, 0 :: (load (<8 x s32>), addrspace 6) ... 
diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir index 9211647b6e94..8fbd694a3988 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir @@ -47,26 +47,23 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $cm0, $cm4, $dc0, $dc4, $dj0, $dj4, $dn0, $dn4, $m0, $m1, $p1, $p2, $p3, $r0, $r1, $s0, $s1, $d0_3d:0x000000000001C870 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit-def $p1, implicit-def $srups_of, implicit-def $r1, implicit-def dead $srcarry, implicit $s1, implicit killed $p1, implicit $m1, implicit $crsat, implicit $crupssign, implicit killed $r1 { - ; CHECK-NEXT: renamable $cm0, renamable $p1 = VLDA_UPS_S32_D8_ag_pstm_nrm renamable $s1, killed renamable $p1, renamable $m1, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) - ; CHECK-NEXT: renamable $r1 = ADD_add_r_ri killed renamable $r1, -4, implicit-def dead $srcarry - ; CHECK-NEXT: } - ; CHECK-NEXT: BUNDLE implicit-def $cm4, implicit-def $bml4, implicit-def $amll4, implicit-def $amlh4, implicit-def $bmh4, implicit-def $amhl4, implicit-def $amhh4, implicit-def $p2, implicit-def $dc0, implicit-def $dc4, implicit-def $srups_of, implicit-def $cm8, implicit-def $bml8, implicit-def $amll8, implicit-def $amlh8, implicit-def $bmh8, implicit-def $amhl8, implicit-def $amhh8, implicit $s1, implicit killed $p2, implicit $d0_3d, implicit $crsat, implicit $crupssign, implicit $r1, implicit $cm0, implicit $r0 { + ; CHECK-NEXT: renamable $cm0, renamable $p1 = VLDA_UPS_S32_D8_ag_pstm_nrm renamable $s1, killed renamable $p1, renamable $m1, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x 
s8>) from stack - 32) + ; CHECK-NEXT: BUNDLE implicit-def $cm4, implicit-def $bml4, implicit-def $amll4, implicit-def $amlh4, implicit-def $bmh4, implicit-def $amhl4, implicit-def $amhh4, implicit-def $p2, implicit-def $dc0, implicit-def $dc4, implicit-def $srups_of, implicit-def $cm8, implicit-def $bml8, implicit-def $amll8, implicit-def $amlh8, implicit-def $bmh8, implicit-def $amhl8, implicit-def $amhh8, implicit $s1, implicit killed $p2, implicit $d0_3d, implicit $crsat, implicit $crupssign, implicit $cm0, implicit $r0 { ; CHECK-NEXT: $cm4, $p2, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 $s1, killed $p2, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) - ; CHECK-NEXT: JNZ renamable $r1, %bb.1 ; CHECK-NEXT: renamable $cm8 = VADD internal renamable $cm4, renamable $cm0, renamable $r0 ; CHECK-NEXT: } + ; CHECK-NEXT: renamable $r1 = ADD_add_r_ri killed renamable $r1, -4, implicit-def dead $srcarry + ; CHECK-NEXT: JNZ renamable $r1, %bb.1 ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP + ; CHECK-NEXT: renamable $p3 = VST_SRS_D8_S32_ag_pstm_nrm_imm killed renamable $p3, 32, killed renamable $cm8, renamable $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP - ; CHECK-NEXT: renamable $p3 = VST_SRS_D8_S32_ag_pstm_nrm_imm killed renamable $p3, 32, killed renamable $cm8, renamable $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) ; CHECK-NEXT: DelayedSchedBarrier ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: liveins: $cm0, $cm4, $p3, $r0, $s0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: NOP ; CHECK-NEXT: renamable $cm0 = VADD killed renamable $cm4, killed renamable $cm0, killed renamable $r0 ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_st.mir 
b/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_st.mir index 9a21a02d93a2..a6094c0ab8fc 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_st.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_st.mir @@ -35,28 +35,28 @@ body: | bb.0.entry: ; CHECK-LABEL: name: STORE_E5_LOAD_E5 ; CHECK: BUNDLE implicit-def $r2, implicit-def $wl2, implicit $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: ST_dms_sts_idx_imm $r0, $p0, 0 ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: $p1 = ST_dms_sts_pstm_nrm_imm $r0, killed $p1, 0 ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: $p2, $dc0 = ST_2D_dms_sts $r0, killed $p2, killed $d0 ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: $p3, $dc1, $dc5 = ST_3D_dms_sts killed $r0, killed $p3, 
killed $d1_3d ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit killed $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP @@ -64,20 +64,20 @@ body: | ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ST_dms_sts_idx_imm $r0, $p0, 0 ; II_ST - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) $p1 = ST_dms_sts_pstm_nrm_imm $r0, $p1, 0 ; II_ST_POST_1D - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) $p2, $dc0 = ST_2D_dms_sts $r0, $p2, $d0 ; II_ST_2D - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) $p3, $dc1, $dc5 = ST_3D_dms_sts $r0, $p3, $d1_3d ; II_ST_3D - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ... 
# Tests II_ST, II_ST_POST_1D, II_ST_2D, II_ST_3D diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_st_q.mir b/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_st_q.mir index 288cac8a3929..28a2b3a48455 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_st_q.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_st_q.mir @@ -32,28 +32,28 @@ body: | bb.0.entry: ; CHECK-LABEL: name: STORE_E5_LOAD_E5 ; CHECK: BUNDLE implicit-def $r2, implicit-def $wl2, implicit $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: ST_dmv_sts_q_ag_idx_imm $q0, $p0, 0 ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: $p1 = ST_dmv_sts_q_ag_pstm_nrm_imm $q0, killed $p1, 0 ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: $p2, $dc0 = ST_2D_dmv_sts_q $q0, killed $p2, killed $d0 ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = 
VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: $p3, $dc1, $dc5 = ST_3D_dmv_sts_q killed $q0, killed $p3, killed $d1_3d ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit killed $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP @@ -61,20 +61,20 @@ body: | ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ST_dmv_sts_q_ag_idx_imm $q0, $p0, 0 ; II_ST_Q - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) $p1 = ST_dmv_sts_q_ag_pstm_nrm_imm $q0, $p1, 0 ; II_ST_POSTINC_Q - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) $p2, $dc0 = ST_2D_dmv_sts_q $q0, $p2, $d0 ; II_ST_2D_Q - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) $p3, $dc1, $dc5 = ST_3D_dmv_sts_q $q0, $p3, $d1_3d ; II_ST_3D_Q - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ... 
# Tests II_ST_Q, II_ST_POSTINC_Q, II_ST_2D_Q, II_ST_3D_Q diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_vst_am.mir b/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_vst_am.mir index 27c28c464ed7..5f3a0c62199d 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_vst_am.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_vst_am.mir @@ -32,28 +32,28 @@ body: | bb.0.entry: ; CHECK-LABEL: name: AM_STORE_E5_LOAD_E5 ; CHECK: BUNDLE implicit-def $r2, implicit-def $wl2, implicit $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: VST_dmw_sts_am_ag_idx_imm $amll0, $p0, 0 ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: $p1 = VST_dmw_sts_am_ag_pstm_nrm_imm $amll0, killed $p1, 0 ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: $p2, $dc0 = VST_2D_dmw_sts_am $amll0, killed $p2, killed $d0 ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 
6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: $p3, $dc1, $dc5 = VST_3D_dmw_sts_am killed $amll0, killed $p3, killed $d1_3d ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit killed $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP @@ -61,20 +61,20 @@ body: | ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) VST_dmw_sts_am_ag_idx_imm $amll0, $p0, 0 ; II_VST_AM - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) $p1 = VST_dmw_sts_am_ag_pstm_nrm_imm $amll0, $p1, 0 ; II_VST_POSTINC_AM - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) $p2, $dc0 = VST_2D_dmw_sts_am $amll0, $p2, $d0 ; II_VST_2D_AM - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) $p3, $dc1, $dc5 = VST_3D_dmw_sts_am $amll0, $p3, $d1_3d ; II_VST_3D_AM - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load 
(<8 x s32>), addrspace 5) ... # Tests II_VST_AM, II_VST_POSTINC_AM, II_VST_2D_AM, II_VST_3D_AM diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_vst_conv.mir b/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_vst_conv.mir index 555d5a6dd697..40da53584b04 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_vst_conv.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_vst_conv.mir @@ -32,36 +32,36 @@ body: | bb.0.entry: ; CHECK-LABEL: name: CONV_STORE_E7_LOAD_E5 ; CHECK: BUNDLE implicit-def $r2, implicit-def $wl2, implicit-def $srf2fflags, implicit $p0, implicit $bml0, implicit $crrnd, implicit $crf2fmask { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: VST_CONV_BF16_FP32_ag_idx_imm $p0, 0, $bml0, implicit-def $srf2fflags, implicit $crrnd, implicit $crf2fmask ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit-def $p1, implicit-def $srf2fflags, implicit $p0, implicit killed $p1, implicit $bml0, implicit $crrnd, implicit $crf2fmask { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: $p1 = VST_CONV_BF16_FP32_ag_pstm_nrm_imm killed $p1, 0, $bml0, implicit-def $srf2fflags, implicit $crrnd, implicit $crf2fmask ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit-def $p2, implicit-def $dc0, implicit-def $srf2fflags, implicit $p0, implicit killed $p2, implicit killed $d0, implicit $bml0, implicit $crrnd, implicit $crf2fmask { 
- ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: $p2, $dc0 = VST_CONV_2D_BF16_FP32 killed $p2, killed $d0, $bml0, implicit-def $srf2fflags, implicit $crrnd, implicit $crf2fmask ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit-def $p3, implicit-def $dc1, implicit-def $dc5, implicit-def $srf2fflags, implicit $p0, implicit killed $p3, implicit killed $d1_3d, implicit killed $bml0, implicit $crrnd, implicit $crf2fmask { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: $p3, $dc1, $dc5 = VST_CONV_3D_BF16_FP32 killed $p3, killed $d1_3d, killed $bml0, implicit-def $srf2fflags, implicit $crrnd, implicit $crf2fmask ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit killed $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP @@ -69,20 +69,20 @@ body: | ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) VST_CONV_BF16_FP32_ag_idx_imm $p0, 0, $bml0, implicit-def $srf2fflags, implicit 
$crrnd, implicit $crf2fmask - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) $p1 = VST_CONV_BF16_FP32_ag_pstm_nrm_imm $p1, 0, $bml0, implicit-def $srf2fflags, implicit $crrnd, implicit $crf2fmask - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) $p2, $dc0 = VST_CONV_2D_BF16_FP32 $p2, $d0, $bml0, implicit-def $srf2fflags, implicit $crrnd, implicit $crf2fmask - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) $p3, $dc1, $dc5 = VST_CONV_3D_BF16_FP32 $p3, $d1_3d, $bml0, implicit-def $srf2fflags, implicit $crrnd, implicit $crf2fmask - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ... 
# Tests II_VST_CONV, II_VST_CONV_POSTINC, II_VST_2D_CONV, II_VST_3D_CONV diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_vst_pack.mir b/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_vst_pack.mir index 0c552ea72c90..691dc77d3389 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_vst_pack.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_vst_pack.mir @@ -35,28 +35,28 @@ body: | bb.0.entry: ; CHECK-LABEL: name: PACK_STORE_E5_LOAD_E5 ; CHECK: BUNDLE implicit-def $r2, implicit-def $wl2, implicit $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: VST_PACK_D4_D8_ag_idx_imm $p0, 0, $x0, implicit $crsat, implicit $crpacksign ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: $p1 = VST_PACK_D4_D8_ag_pstm_nrm_imm killed $p1, 0, $x0, implicit $crsat, implicit $crpacksign ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: $p2, $dc0 = VST_2D_PACK_D4_D8 killed $p2, killed $d0, $x0, implicit $crsat, implicit $crpacksign ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm 
$p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: $p3, $dc1, $dc5 = VST_3D_PACK_D4_D8 killed $p3, killed $d1_3d, killed $x0, implicit $crsat, implicit $crpacksign ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit killed $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP @@ -64,20 +64,20 @@ body: | ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) VST_PACK_D4_D8_ag_idx_imm $p0, 0, $x0, implicit $crsat, implicit $crpacksign - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) $p1 = VST_PACK_D4_D8_ag_pstm_nrm_imm $p1, 0, $x0, implicit $crsat, implicit $crpacksign - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) $p2, $dc0 = VST_2D_PACK_D4_D8 $p2, $d0, $x0, implicit $crsat, implicit $crpacksign - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) $p3, $dc1, $dc5 = 
VST_3D_PACK_D4_D8 $p3, $d1_3d, $x0, implicit $crsat, implicit $crpacksign - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ... # Tests II_VST_PACK, II_VST_POSTINC_PACK, II_VST_2D_PACK, II_VST_3D_PACK diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_vst_srs.mir b/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_vst_srs.mir index 0e7f25386b78..070e23147817 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_vst_srs.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_vst_srs.mir @@ -32,36 +32,36 @@ body: | bb.0.entry: ; CHECK-LABEL: name: SRS_STORE_E7_LOAD_E5 ; CHECK: BUNDLE implicit-def $r2, implicit-def $wl2, implicit-def $srsrs_of, implicit $p0, implicit $cm0, implicit $s0, implicit $crsat, implicit $crrnd, implicit $crsrssign { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: VST_SRS_D8_S32_ag_idx_imm $p0, 0, $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit-def $p1, implicit-def $srsrs_of, implicit $p0, implicit killed $p1, implicit $cm0, implicit $s0, implicit $crsat, implicit $crrnd, implicit $crsrssign { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: $p1 = VST_SRS_D8_S32_ag_pstm_nrm_imm killed $p1, 0, $cm0, $s0, 
implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit-def $p2, implicit-def $dc0, implicit-def $srsrs_of, implicit $p0, implicit killed $p2, implicit killed $d0, implicit $cm0, implicit $s0, implicit $crsat, implicit $crrnd, implicit $crsrssign { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: $p2, $dc0 = VST_2D_SRS_D8_S32 killed $p2, killed $d0, $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit-def $p3, implicit-def $dc1, implicit-def $dc5, implicit-def $srsrs_of, implicit $p0, implicit killed $p3, implicit killed $d1_3d, implicit killed $cm0, implicit killed $s0, implicit $crsat, implicit $crrnd, implicit $crsrssign { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: $p3, $dc1, $dc5 = VST_3D_SRS_D8_S32 killed $p3, killed $d1_3d, killed $cm0, killed $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit killed $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: 
(load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP @@ -69,20 +69,20 @@ body: | ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) VST_SRS_D8_S32_ag_idx_imm $p0, 0, $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) $p1 = VST_SRS_D8_S32_ag_pstm_nrm_imm $p1, 0, $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) $p2, $dc0 = VST_2D_SRS_D8_S32 $p2, $d0, $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) $p3, $dc1, $dc5 = VST_3D_SRS_D8_S32 $p3, $d1_3d, $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ... 
# Tests II_VST_SRS, II_VST_POSTINC_SRS, II_VST_2D_SRS, II_VST_3D_SRS diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_vst_w.mir b/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_vst_w.mir index 318bcae8832f..d9c51b10aa35 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_vst_w.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/memory_dependencies_vst_w.mir @@ -32,28 +32,28 @@ body: | bb.0.entry: ; CHECK-LABEL: name: W_STORE_E5_LOAD_E5 ; CHECK: BUNDLE implicit-def $r2, implicit-def $wl2, implicit $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm $wl0, $p0, 0 ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: $p1 = VST_dmw_sts_w_ag_pstm_nrm_imm $wl0, killed $p1, 0 ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: $p2, $dc0 = VST_2D_dmw_sts_w $wl0, killed $p2, killed $d0 ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) 
+ ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: $p3, $dc1, $dc5 = VST_3D_dmw_sts_w killed $wl0, killed $p3, killed $d1_3d ; CHECK-NEXT: BUNDLE implicit-def $r2, implicit-def $wl2, implicit killed $p0 { - ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 - ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP @@ -61,20 +61,20 @@ body: | ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) VST_dmw_sts_w_ag_idx_imm $wl0, $p0, 0 ; II_VST_W - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) $p1 = VST_dmw_sts_w_ag_pstm_nrm_imm $wl0, $p1, 0 ; II_VST_POSTINC_W - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) $p2, $dc0 = VST_2D_dmw_sts_w $wl0, $p2, $d0 ; II_VST_2D_W - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) $p3, $dc1, $dc5 = VST_3D_dmw_sts_w $wl0, $p3, $d1_3d ; II_VST_3D_W - $r2 = LDA_dms_lda_idx_imm $p0, 0 - $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $r2 = LDA_dms_lda_idx_imm $p0, 0 :: (load (s32), addrspace 6) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 
5) ... # Tests II_VST_W, II_VST_POSTINC_W, II_VST_2D_W, II_VST_3D_W diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/add2d_inner.mir b/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/add2d_inner.mir index 3e5609195bee..94c83d3dcbdc 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/add2d_inner.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/add2d_inner.mir @@ -52,13 +52,13 @@ body: | ; CHECK-NEXT: [[VADD2:%[0-9]+]]:acc1024 = VADD [[COPY7]], [[COPY19]], [[COPY14]] ; CHECK-NEXT: [[COPY20:%[0-9]+]]:acc1024 = COPY [[COPY1]] ; CHECK-NEXT: [[VADD3:%[0-9]+]]:acc1024 = VADD [[COPY8]], [[COPY20]], [[COPY14]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD1]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD2]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store 
(<32 x s8>) into stack - 128) ; CHECK-NEXT: [[COPY11:%[0-9]+]]:ep_as_32bit = VST_SRS_D8_S32_ag_pstm_nrm_imm [[COPY11]], 32, [[VADD3]], [[COPY16]], implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc1024, [[COPY9:%[0-9]+]]:ep_as_32bit = VLDA_UPS_S32_D8_ag_pstm_nrm [[COPY15]], [[COPY9]], [[COPY]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:acc1024, [[COPY10:%[0-9]+]]:ep_as_32bit, [[COPY12:%[0-9]+]].sub_dim_count:eds, [[COPY12:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 [[COPY15]], [[COPY10]], [[COPY12]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) ; CHECK-NEXT: [[COPY6:%[0-9]+]]:acc1024, [[COPY10:%[0-9]+]]:ep_as_32bit, [[COPY12:%[0-9]+]].sub_dim_count:eds, [[COPY12:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_UPS_S32_D8 [[COPY15]], [[COPY10]], [[COPY12]], implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/resource/memory_bank.mir b/llvm/test/CodeGen/AIE/aie2/schedule/resource/memory_bank.mir new file mode 
100644 index 000000000000..e0e881440fc2 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/resource/memory_bank.mir @@ -0,0 +1,1119 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# RUN: llc -march=aie2 -run-pass=postmisched %s -o - | FileCheck %s + +# This test checks that we are scheduling loads from different banks in one VLIW bundle. + +--- +name: VLDA_VLDB_noBank_BankA +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_noBank_BankA + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>)) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>)) +... + +--- +name: VLDA_VLDB_BankA_noBank +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_BankA_noBank + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>)) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>)) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) +... 
+ +--- +name: VLDA_VLDB_same_bankA +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_same_bankA + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) +... + +--- +name: VLDA_VLDB_same_bankB +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_same_bankB + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) +... + +--- +name: VLDA_VLDB_same_bankC +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_same_bankC + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) +... 
+ +--- +name: VLDA_VLDB_same_bankD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_same_bankD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) +... + +--- +name: VLDA_VLDB_same_bankAB +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_same_bankAB + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) +... + +--- +name: VLDA_VLDB_same_bankAC +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_same_bankAC + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) +... 
+ +--- +name: VLDA_VLDB_same_bankAD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_same_bankAD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) +... + +--- +name: VLDA_VLDB_same_bankBC +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_same_bankBC + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) +... + +--- +name: VLDA_VLDB_same_bankBD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_same_bankBD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 13) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 13) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 13) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 13) +... 
+ +--- +name: VLDA_VLDB_same_bankCD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_same_bankCD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 14) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 14) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 14) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 14) +... + +--- +name: VLDA_VLDB_different_bank_A_and_B +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_A_and_B + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) +... + +--- +name: VLDA_VLDB_different_bank_A_and_C +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_A_and_C + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) +... 
+ +--- +name: VLDA_VLDB_different_bank_A_and_D +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_A_and_D + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) +... + +--- +name: VLDA_VLDB_different_bank_B_and_C +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_B_and_C + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) +... 
+ +--- +name: VLDA_VLDB_different_bank_B_and_D +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_B_and_D + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) +... + +--- +name: VLDA_VLDB_different_bank_C_and_D +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_C_and_D + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) +... 
+ + +--- +name: VLDA_VLDB_different_bank_A_B_C_D +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_A_B_C_D + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: } + ; CHECK-NEXT: BUNDLE implicit-def $wh2, implicit-def $wh1, implicit killed $p0 { + ; CHECK-NEXT: $wh2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: $wh1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) + $wh1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) + $wh2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) +... + + +--- +name: VLDA_VLDB_different_bank_AB_A +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AB_A + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) +... 
+ +--- +name: VLDA_VLDB_different_bank_AB_B +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AB_B + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) +... + +--- +name: VLDA_VLDB_different_bank_AB_C +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AB_C + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) +... + + +--- +name: VLDA_VLDB_different_bank_AB_D +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AB_D + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) +... 
+ +--- +name: VLDA_VLDB_different_bank_AC_A +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AC_A + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) +... + +--- +name: VLDA_VLDB_different_bank_AC_B +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AC_B + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) +... + +--- +name: VLDA_VLDB_different_bank_AC_C +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AC_C + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) +... 
+ +--- +name: VLDA_VLDB_different_bank_AC_D +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AC_D + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) +... + +--- +name: VLDA_VLDB_different_bank_AD_A +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AD_A + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) +... + +--- +name: VLDA_VLDB_different_bank_AD_B +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AD_B + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) +... 
+ +--- +name: VLDA_VLDB_different_bank_AD_C +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AD_C + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) +... + +--- +name: VLDA_VLDB_different_bank_AD_D +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AD_D + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) +... + +--- +name: VLDA_VLDB_different_bank_BC_A +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_BC_A + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) +... 
+ +--- +name: VLDA_VLDB_different_bank_BC_B +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_BC_B + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) +... + +--- +name: VLDA_VLDB_different_bank_BC_C +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_BC_C + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) +... + +--- +name: VLDA_VLDB_different_bank_BC_D +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_BC_D + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) +... 
+ +--- +name: VLDA_VLDB_different_bank_BD_A +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_BD_A + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 13) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 13) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) +... + +--- +name: VLDA_VLDB_different_bank_BD_B +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_BD_B + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 13) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 13) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) +... + +--- +name: VLDA_VLDB_different_bank_BD_C +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_BD_C + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 13) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 13) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) +... 
+ +--- +name: VLDA_VLDB_different_bank_BD_D +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_BD_D + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 13) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 13) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) +... + +--- +name: VLDA_VLDB_different_bank_CD_A +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_CD_A + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 14) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 14) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) +... + +--- +name: VLDA_VLDB_different_bank_CD_B +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_CD_B + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 14) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 14) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) +... 
+ +--- +name: VLDA_VLDB_different_bank_CD_C +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_CD_C + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 14) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 14) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) +... + +--- +name: VLDA_VLDB_different_bank_CD_D +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_CD_D + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 14) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 14) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) +... + + +--- +name: VLDA_VLDB_different_bank_AB_AC +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AB_AC + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) +... 
+ +--- +name: VLDA_VLDB_different_bank_AB_AD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AB_AD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) +... + +--- +name: VLDA_VLDB_different_bank_AB_BC +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AB_BC + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) +... + + +--- +name: VLDA_VLDB_different_bank_AB_BD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AB_BD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 13) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 13) +... 
+ +--- +name: VLDA_VLDB_different_bank_AB_CD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AB_CD + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 14) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 14) +... + +--- +name: VLDA_VLDB_different_bank_AC_AD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AC_AD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) +... + +--- +name: VLDA_VLDB_different_bank_AC_BC +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AC_BC + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) +... 
+ +--- +name: VLDA_VLDB_different_bank_AC_BD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AC_BD + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 13) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 13) +... + +--- +name: VLDA_VLDB_different_bank_AC_CD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AC_CD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 14) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 14) +... 
+ +--- +name: VLDA_VLDB_different_bank_AD_BC +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AD_BC + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) +... + +--- +name: VLDA_VLDB_different_bank_AD_BD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AD_BD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 13) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 13) +... + +--- +name: VLDA_VLDB_different_bank_AD_CD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AD_CD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 14) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 14) +... 
+ +--- +name: VLDA_VLDB_different_bank_BC_BD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_BC_BD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 13) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 13) +... + +--- +name: VLDA_VLDB_different_bank_BC_CD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_BC_CD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 14) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 14) +... 
+ + + +--- +name: VLDA_VLDB_different_bank_ABCD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_ABCD + ; CHECK: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit $p0 { + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: } + ; CHECK-NEXT: BUNDLE implicit-def $wh2, implicit-def $wh1, implicit killed $p0 { + ; CHECK-NEXT: $wh2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: $wh1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) + $wh1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) + $wh2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) +... 
diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/vlda_vldb-compr.mir b/llvm/test/CodeGen/AIE/aie2/schedule/vlda_vldb-compr.mir index b584dde565fc..d23ca66b431d 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/vlda_vldb-compr.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/vlda_vldb-compr.mir @@ -17,8 +17,8 @@ body: | ; CHECK-LABEL: name: II_VLDA_II_VLDB_COMPR_FILL ; CHECK: VST_dmw_sts_w_ag_idx_imm killed $wl0, $p0, 0 ; CHECK-NEXT: BUNDLE implicit-def $wl2, implicit-def $p0, implicit killed $p1, implicit killed $p0, implicit killed $dp { - ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p1, 0 - ; CHECK-NEXT: $p0 = VLDB_COMPR_FILL killed $p0, implicit killed $dp + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p1, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: $p0 = VLDB_COMPR_FILL killed $p0, implicit killed $dp :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP @@ -27,8 +27,8 @@ body: | ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP VST_dmw_sts_w_ag_idx_imm $wl0, $p0, 0 - $p0 = VLDB_COMPR_FILL $p0, implicit $dp - $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p1, 0 + $p0 = VLDB_COMPR_FILL $p0, implicit $dp :: (load (<8 x s32>), addrspace 5) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p1, 0 :: (load (<8 x s32>), addrspace 6) ... 
--- @@ -39,8 +39,8 @@ body: | ; CHECK-LABEL: name: II_VLDA_II_VLDB_COMPR_RESET ; CHECK: VST_dmw_sts_w_ag_idx_imm killed $wl0, $p0, 0 ; CHECK-NEXT: BUNDLE implicit-def $wl2, implicit-def $p0, implicit-def $dp, implicit killed $p1, implicit killed $p0, implicit killed $dp { - ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p1, 0 - ; CHECK-NEXT: $p0 = VLDB_COMPR_RESET killed $p0, implicit-def $dp, implicit killed $dp + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p1, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: $p0 = VLDB_COMPR_RESET killed $p0, implicit-def $dp, implicit killed $dp :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP @@ -49,8 +49,8 @@ body: | ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP VST_dmw_sts_w_ag_idx_imm $wl0, $p0, 0 - $p0 = VLDB_COMPR_RESET $p0, implicit-def $dp, implicit $dp - $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p1, 0 + $p0 = VLDB_COMPR_RESET $p0, implicit-def $dp, implicit $dp :: (load (<8 x s32>), addrspace 5) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p1, 0 :: (load (<8 x s32>), addrspace 6) ... 
--- @@ -61,8 +61,8 @@ body: | ; CHECK-LABEL: name: II_VLDA_II_VLDB_COMPR_PEEK ; CHECK: VST_dmw_sts_w_ag_idx_imm killed $wl0, $p0, 0 ; CHECK-NEXT: BUNDLE implicit-def $wl2, implicit-def $p0, implicit-def $wh0, implicit-def $srcompr_uf, implicit killed $p1, implicit killed $p0, implicit killed $dp { - ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p1, 0 - ; CHECK-NEXT: $p0, $wh0 = VLDB_COMPR_PEEK killed $p0, implicit-def $srcompr_uf, implicit killed $dp + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p1, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: $p0, $wh0 = VLDB_COMPR_PEEK killed $p0, implicit-def $srcompr_uf, implicit killed $dp :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP @@ -71,8 +71,8 @@ body: | ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP VST_dmw_sts_w_ag_idx_imm $wl0, $p0, 0 - $p0, $wh0 = VLDB_COMPR_PEEK $p0, implicit-def $srcompr_uf, implicit $dp - $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p1, 0 + $p0, $wh0 = VLDB_COMPR_PEEK $p0, implicit-def $srcompr_uf, implicit $dp :: (load (<8 x s32>), addrspace 5) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p1, 0 :: (load (<8 x s32>), addrspace 6) ... 
--- @@ -83,8 +83,8 @@ body: | ; CHECK-LABEL: name: II_VLDA_II_VLDB_COMPR_POP ; CHECK: VST_dmw_sts_w_ag_idx_imm killed $wl0, $p0, 0 ; CHECK-NEXT: BUNDLE implicit-def $wl2, implicit-def $p0, implicit-def $wh0, implicit-def $srcompr_uf, implicit-def $dp, implicit killed $p1, implicit killed $p0, implicit killed $dp { - ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p1, 0 - ; CHECK-NEXT: $p0, $wh0 = VLDB_COMPR_POP killed $p0, implicit-def $srcompr_uf, implicit-def $dp, implicit killed $dp + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p1, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: $p0, $wh0 = VLDB_COMPR_POP killed $p0, implicit-def $srcompr_uf, implicit-def $dp, implicit killed $dp :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP @@ -93,6 +93,6 @@ body: | ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP VST_dmw_sts_w_ag_idx_imm $wl0, $p0, 0 - $p0, $wh0 = VLDB_COMPR_POP $p0, implicit-def $srcompr_uf, implicit-def $dp, implicit $dp - $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p1, 0 + $p0, $wh0 = VLDB_COMPR_POP $p0, implicit-def $srcompr_uf, implicit-def $dp, implicit $dp :: (load (<8 x s32>), addrspace 5) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p1, 0 :: (load (<8 x s32>), addrspace 6) ... 
diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/vlda_vldb-sparse.mir b/llvm/test/CodeGen/AIE/aie2/schedule/vlda_vldb-sparse.mir index 9fb648c90e4e..7d4dee974a20 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/vlda_vldb-sparse.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/vlda_vldb-sparse.mir @@ -17,8 +17,8 @@ body: | ; CHECK-LABEL: name: II_VLDA_II_VLDB_SPARSE_FILL ; CHECK: VST_dmw_sts_w_ag_idx_imm killed $wl0, $p0, 0 ; CHECK-NEXT: BUNDLE implicit-def $wl2, implicit-def $p0, implicit-def $srsparse_of, implicit killed $p1, implicit killed $p0, implicit killed $dp { - ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p1, 0 - ; CHECK-NEXT: $p0 = VLDB_SPARSE_FILL_4 killed $p0, implicit-def $srsparse_of, implicit killed $dp + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p1, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: $p0 = VLDB_SPARSE_FILL_4 killed $p0, implicit-def $srsparse_of, implicit killed $dp :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP @@ -27,8 +27,8 @@ body: | ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP VST_dmw_sts_w_ag_idx_imm $wl0, $p0, 0 - $p0 = VLDB_SPARSE_FILL_4 $p0, implicit-def $srsparse_of, implicit $dp - $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p1, 0 + $p0 = VLDB_SPARSE_FILL_4 $p0, implicit-def $srsparse_of, implicit $dp :: (load (<8 x s32>), addrspace 5) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p1, 0 :: (load (<8 x s32>), addrspace 6) ... 
--- @@ -39,8 +39,8 @@ body: | ; CHECK-LABEL: name: II_VLDA_II_VLDB_SPARSE_RESET ; CHECK: VST_dmw_sts_w_ag_idx_imm killed $wl0, $p0, 0 ; CHECK-NEXT: BUNDLE implicit-def $wl2, implicit-def $p0, implicit-def $srsparse_of, implicit-def $dp, implicit killed $p1, implicit killed $p0, implicit killed $dp { - ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p1, 0 - ; CHECK-NEXT: $p0 = VLDB_SPARSE_RESET_4 killed $p0, implicit-def $srsparse_of, implicit-def $dp, implicit killed $dp + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p1, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: $p0 = VLDB_SPARSE_RESET_4 killed $p0, implicit-def $srsparse_of, implicit-def $dp, implicit killed $dp :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP @@ -49,8 +49,8 @@ body: | ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP VST_dmw_sts_w_ag_idx_imm $wl0, $p0, 0 - $p0 = VLDB_SPARSE_RESET_4 $p0, implicit-def $srsparse_of, implicit-def $dp, implicit $dp - $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p1, 0 + $p0 = VLDB_SPARSE_RESET_4 $p0, implicit-def $srsparse_of, implicit-def $dp, implicit $dp :: (load (<8 x s32>), addrspace 5) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p1, 0 :: (load (<8 x s32>), addrspace 6) ... 
--- @@ -61,8 +61,8 @@ body: | ; CHECK-LABEL: name: II_VLDA_II_VLDB_SPARSE_PEEK ; CHECK: VST_dmw_sts_w_ag_idx_imm killed $wl0, $p0, 0 ; CHECK-NEXT: BUNDLE implicit-def $wl2, implicit-def $p0, implicit-def $qwh0, implicit-def $qh0, implicit-def $wh0, implicit-def $srsparse_of, implicit-def $srcompr_uf, implicit killed $p1, implicit killed $p0, implicit killed $dp { - ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p1, 0 - ; CHECK-NEXT: $p0, $qwh0 = VLDB_SPARSE_PEEK_4 killed $p0, implicit-def $srsparse_of, implicit-def $srcompr_uf, implicit killed $dp + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p1, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: $p0, $qwh0 = VLDB_SPARSE_PEEK_4 killed $p0, implicit-def $srsparse_of, implicit-def $srcompr_uf, implicit killed $dp :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP @@ -71,8 +71,8 @@ body: | ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP VST_dmw_sts_w_ag_idx_imm $wl0, $p0, 0 - $p0, $qwh0 = VLDB_SPARSE_PEEK_4 $p0, implicit-def $srsparse_of, implicit-def $srcompr_uf, implicit $dp - $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p1, 0 + $p0, $qwh0 = VLDB_SPARSE_PEEK_4 $p0, implicit-def $srsparse_of, implicit-def $srcompr_uf, implicit $dp :: (load (<8 x s32>), addrspace 5) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p1, 0 :: (load (<8 x s32>), addrspace 6) ... 
--- @@ -83,8 +83,8 @@ body: | ; CHECK-LABEL: name: II_VLDA_II_VLDB_SPARSE_POP ; CHECK: VST_dmw_sts_w_ag_idx_imm killed $wl0, $p0, 0 ; CHECK-NEXT: BUNDLE implicit-def $wl2, implicit-def $p0, implicit-def $qwh0, implicit-def $qh0, implicit-def $wh0, implicit-def $srsparse_of, implicit-def $srcompr_uf, implicit-def $dp, implicit killed $p1, implicit killed $p0, implicit killed $dp { - ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p1, 0 - ; CHECK-NEXT: $p0, $qwh0 = VLDB_SPARSE_POP_4 killed $p0, implicit-def $srsparse_of, implicit-def $srcompr_uf, implicit-def $dp, implicit killed $dp + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p1, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: $p0, $qwh0 = VLDB_SPARSE_POP_4 killed $p0, implicit-def $srsparse_of, implicit-def $srcompr_uf, implicit-def $dp, implicit killed $dp :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP @@ -93,6 +93,6 @@ body: | ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP VST_dmw_sts_w_ag_idx_imm $wl0, $p0, 0 - $p0, $qwh0 = VLDB_SPARSE_POP_4 $p0, implicit-def $srsparse_of, implicit-def $srcompr_uf, implicit-def $dp, implicit $dp - $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p1, 0 + $p0, $qwh0 = VLDB_SPARSE_POP_4 $p0, implicit-def $srsparse_of, implicit-def $srcompr_uf, implicit-def $dp, implicit $dp :: (load (<8 x s32>), addrspace 5) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p1, 0 :: (load (<8 x s32>), addrspace 6) ... 
diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/vlda_vldb.mir b/llvm/test/CodeGen/AIE/aie2/schedule/vlda_vldb.mir index 54170293659b..900a762b23dd 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/vlda_vldb.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/vlda_vldb.mir @@ -17,8 +17,8 @@ body: | ; CHECK-LABEL: name: VLDA_VLDB ; CHECK: VST_dmw_sts_w_ag_idx_imm killed $wl0, $p0, 0 ; CHECK-NEXT: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0 { - ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 - ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP @@ -27,9 +27,10 @@ body: | ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP VST_dmw_sts_w_ag_idx_imm $wl0, $p0, 0 - $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 - $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) ... 
+ --- name: VLDA_VLDB_128 alignment: 16 @@ -38,8 +39,8 @@ body: | ; CHECK-LABEL: name: VLDA_VLDB_128 ; CHECK: VST_128_ag_idx_imm killed $wl0, $p0, 0 ; CHECK-NEXT: BUNDLE implicit-def $wl2, implicit-def $wl1, implicit killed $p0, implicit killed $dj0 { - ; CHECK-NEXT: $wl2 = VLDA_128 $p0 - ; CHECK-NEXT: $wl1 = VLDB_128_ag_idx killed $p0, killed $dj0 + ; CHECK-NEXT: $wl2 = VLDA_128 $p0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: $wl1 = VLDB_128_ag_idx killed $p0, killed $dj0 :: (load (<8 x s32>), addrspace 5) ; CHECK-NEXT: } ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP @@ -48,6 +49,43 @@ body: | ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP VST_128_ag_idx_imm $wl0, $p0, 0 - $wl1 = VLDB_128_ag_idx $p0, $dj0 - $wl2 = VLDA_128 $p0 + $wl1 = VLDB_128_ag_idx $p0, $dj0 :: (load (<8 x s32>), addrspace 5) + $wl2 = VLDA_128 $p0 :: (load (<8 x s32>), addrspace 6) +... + +--- +name: VLDA_VLDB_with_MMO_withOut_Addrspace +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_with_MMO_withOut_Addrspace + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>)) + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>)) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>)) + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>)) +... + +# We expect all the load/store to be with MMOs. +--- +name: VLDA_VLDB_withOut_MMO +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_withOut_MMO + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 ... 
diff --git a/llvm/unittests/Target/AIE/HazardRecognizerTest.cpp b/llvm/unittests/Target/AIE/HazardRecognizerTest.cpp index 4e9716a0294f..cbb7e145fa5c 100644 --- a/llvm/unittests/Target/AIE/HazardRecognizerTest.cpp +++ b/llvm/unittests/Target/AIE/HazardRecognizerTest.cpp @@ -97,13 +97,17 @@ class MockHR : public AIEHazardRecognizer { : AIEHazardRecognizer(&InstrInfo, &Itins, /*IsPreRA=*/false) { MockScoreboard.reset(computeScoreboardDepth()); } - void emit(unsigned SchedClass, int Delta, SlotBits SlotSet = 0) { - enterResources(MockScoreboard, &Itins, SchedClass, SlotSet, Delta, - std::nullopt); + void emit(unsigned SchedClass, int Delta, SlotBits SlotSet = 0, + MemoryBankBits MemoryBanks = 0, + SmallVector MemoryAccessCycles = {}) { + enterResources(MockScoreboard, &Itins, SchedClass, SlotSet, MemoryBanks, + MemoryAccessCycles, Delta, std::nullopt); } - bool hazard(unsigned SchedClass, int Delta, SlotBits SlotSet = 0) { - return checkConflict(MockScoreboard, &Itins, SchedClass, SlotSet, Delta, - std::nullopt); + bool hazard(unsigned SchedClass, int Delta, SlotBits SlotSet = 0, + MemoryBankBits MemoryBanks = 0, + SmallVector MemoryAccessCycles = {}) { + return checkConflict(MockScoreboard, &Itins, SchedClass, SlotSet, + MemoryBanks, MemoryAccessCycles, Delta, std::nullopt); } void AdvanceCycle() override { MockScoreboard.advance(); } void RecedeCycle() override { MockScoreboard.recede(); } @@ -114,6 +118,9 @@ class MockHR : public AIEHazardRecognizer { bool conflict(const MockHR &Other, int DeltaCycles) const { return MockScoreboard.conflict(Other.MockScoreboard, DeltaCycles); } + void blockResources(int DeltaCycles) { + AIEHazardRecognizer::blockCycleInScoreboard(DeltaCycles); + } }; TEST(HazardRecognizer, empty) { @@ -375,3 +382,121 @@ TEST(HazardRecognizer, composeConflicting) { EXPECT_EQ(HR.hazard(9, C), C == -1 || C == 0); } } + +/// Check scoreboard conflicts from bank utilization +TEST(HazardRecognizer, bankConflictHazard) { + AIE2InstrInfo InstrInfo; + 
MockHR HR(InstrInfo); + + HR.emit(1, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b1010, + /*MemoryAccessCycle=*/{5}); + + // Classes 1 and 3 have no resource conflicts in MockStages, they can only + // conflict because of Memory Banks. + EXPECT_FALSE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b01, + /*MemoryAccessCycle=*/{5})); + EXPECT_FALSE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b100, + /*MemoryAccessCycle=*/{5})); + EXPECT_FALSE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b0101, + /*MemoryAccessCycle=*/{5})); + + // Expected to conflict since same bank & same memory access cycle + EXPECT_TRUE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b010, + /*MemoryAccessCycle=*/{5})); + EXPECT_TRUE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b1000, + /*MemoryAccessCycle=*/{5})); + EXPECT_TRUE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b1010, + /*MemoryAccessCycle=*/{5})); + EXPECT_TRUE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b1111, + /*MemoryAccessCycle=*/{5})); + + // Not Expected to conflict since same bank but different memory access cycle + EXPECT_FALSE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b010, + /*MemoryAccessCycle=*/{6})); + EXPECT_FALSE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b1000, + /*MemoryAccessCycle=*/{6})); + EXPECT_FALSE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b1010, + /*MemoryAccessCycle=*/{6})); + EXPECT_FALSE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b1111, + /*MemoryAccessCycle=*/{6})); +} + +/// Check scoreboard conflicts from bank utilization in multiple cycles +TEST(HazardRecognizer, bankConflictHazardMultiCycle) { + AIE2InstrInfo InstrInfo; + MockHR HR(InstrInfo); + + HR.emit(1, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b1010, + /*MemoryAccessCycle=*/{5, 7}); + + // Classes 1 and 3 have no resource conflicts in MockStages, they can only + // conflict because of Memory Banks. 
+ EXPECT_FALSE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b01, + /*MemoryAccessCycle=*/{5})); + EXPECT_FALSE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b100, + /*MemoryAccessCycle=*/{5})); + EXPECT_FALSE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b0101, + /*MemoryAccessCycle=*/{5})); + + // Expected to conflict since same bank & same memory access cycle + EXPECT_TRUE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b010, + /*MemoryAccessCycle=*/{5})); + EXPECT_TRUE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b1000, + /*MemoryAccessCycle=*/{5})); + EXPECT_TRUE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b1010, + /*MemoryAccessCycle=*/{5})); + EXPECT_TRUE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b1111, + /*MemoryAccessCycle=*/{5})); + + // Not Expected to conflict since same bank but different memory access cycle + EXPECT_FALSE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b010, + /*MemoryAccessCycle=*/{6})); + EXPECT_FALSE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b1000, + /*MemoryAccessCycle=*/{6})); + EXPECT_FALSE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b1010, + /*MemoryAccessCycle=*/{6})); + EXPECT_FALSE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b1111, + /*MemoryAccessCycle=*/{6})); + + // Not Expected to conflict since different bank but same memory access cycles + EXPECT_FALSE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b01, + /*MemoryAccessCycle=*/{5, 7})); + EXPECT_FALSE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b100, + /*MemoryAccessCycle=*/{5, 7})); + EXPECT_FALSE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b0101, + /*MemoryAccessCycle=*/{5, 7})); + + // Expected to conflict since same bank & at least one common access cycle + EXPECT_TRUE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b010, + /*MemoryAccessCycle=*/{5, 7})); + EXPECT_TRUE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b1000, + /*MemoryAccessCycle=*/{5, 11})); + EXPECT_TRUE(HR.hazard(3, 0, 
/*SlotSet=*/0b0, /*MemoryBanks=*/0b1010, + /*MemoryAccessCycle=*/{7, 11})); + EXPECT_TRUE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b1111, + /*MemoryAccessCycle=*/{5, 7})); + + // Not Expected to conflict since same bank but different memory access cycle + EXPECT_FALSE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b010, + /*MemoryAccessCycle=*/{1, 6})); + EXPECT_FALSE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b1000, + /*MemoryAccessCycle=*/{2, 8, 11})); + EXPECT_FALSE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b1010, + /*MemoryAccessCycle=*/{1, 6, 11})); + EXPECT_FALSE(HR.hazard(3, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b1111, + /*MemoryAccessCycle=*/{6, 11})); +} + +/// Check scoreboard to show blockResources does not touch MemoryBanks +TEST(HazardRecognizer, blockResourcesMemoryBanks) { + AIE2InstrInfo InstrInfo; + MockHR HR(InstrInfo); + + HR.blockResources(0); + + // Not Expected to conflict since blockResources does not touch MemoryBanks + EXPECT_FALSE(HR.hazard(0, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b010, + /*MemoryAccessCycle=*/{1, 6})); + EXPECT_FALSE(HR.hazard(0, 0, /*SlotSet=*/0b0, /*MemoryBanks=*/0b010, + /*MemoryAccessCycle=*/{5})); +} \ No newline at end of file From d7802c7a8a53c208ab991ce899b735260916daae Mon Sep 17 00:00:00 2001 From: Krishnam Tibrewala Date: Fri, 9 Aug 2024 10:46:05 -0700 Subject: [PATCH 20/31] [AIE2] Add stack addrspace annotation to Load/Store to stack This work expects --aie-stack-addrspace to be passed to describe which AS the stack is allocated in --- llvm/lib/Target/AIE/AIE2TargetMachine.cpp | 16 ++ llvm/lib/Target/AIE/AIE2TargetMachine.h | 1 + .../AIE/aie2/schedule/vld_vld_spill.mir | 138 ++++++++++++++++++ 3 files changed, 155 insertions(+) create mode 100644 llvm/test/CodeGen/AIE/aie2/schedule/vld_vld_spill.mir diff --git a/llvm/lib/Target/AIE/AIE2TargetMachine.cpp b/llvm/lib/Target/AIE/AIE2TargetMachine.cpp index b3bfd80b4833..61d966c0d05f 100644 --- a/llvm/lib/Target/AIE/AIE2TargetMachine.cpp +++ 
b/llvm/lib/Target/AIE/AIE2TargetMachine.cpp @@ -60,6 +60,11 @@ static cl::opt EnablePreMISchedCoalescer( "aie-premisched-coalescer", cl::Hidden, cl::init(true), cl::desc("Run the coalescer again after the pre-RA scheduler")); +static cl::opt StackAddrSpace( + "aie-stack-addrspace", cl::init(0), + cl::desc("Specify the addrspace where the stack is allocated " + "(5: Bank A, 6: Bank B, 7: Bank C, 8: Bank D)")); + extern bool AIEDumpArtifacts; void AIE2TargetMachine::anchor() {} @@ -259,3 +264,14 @@ bool AIE2PassConfig::addInstSelector() { addPass(createMachineFunctionDumperPass(/*Suffix=*/"after-isel")); return false; } + +unsigned +AIE2TargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const { + switch (Kind) { + case PseudoSourceValue::Stack: + case PseudoSourceValue::FixedStack: + return StackAddrSpace; + default: + return static_cast(AIE2::AddressSpaces::none); + } +} diff --git a/llvm/lib/Target/AIE/AIE2TargetMachine.h b/llvm/lib/Target/AIE/AIE2TargetMachine.h index 8a2c2a94cbf4..70f0bca12ff4 100644 --- a/llvm/lib/Target/AIE/AIE2TargetMachine.h +++ b/llvm/lib/Target/AIE/AIE2TargetMachine.h @@ -43,6 +43,7 @@ class AIE2TargetMachine : public AIEBaseTargetMachine { TargetPassConfig *createPassConfig(PassManagerBase &PM) override; /// PostRAScheduling is scheduled as part of PreSched2 passes. bool targetSchedulesPostRAScheduling() const override { return true; } + unsigned getAddressSpaceForPseudoSourceKind(unsigned Kind) const override; }; } // namespace llvm diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/vld_vld_spill.mir b/llvm/test/CodeGen/AIE/aie2/schedule/vld_vld_spill.mir new file mode 100644 index 000000000000..0b40045d3a7f --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/vld_vld_spill.mir @@ -0,0 +1,138 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. 
+# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# RUN: llc -march=aie2 -run-pass=postmisched %topdown-multi --aie-stack-addrspace=5 %s -o - | FileCheck -check-prefixes=STACK-BANK-A %s +# RUN: llc -march=aie2 -run-pass=postmisched %topdown-multi --aie-stack-addrspace=6 %s -o - | FileCheck -check-prefixes=STACK-BANK-B %s + +# This test checks scheduling of LD_SPILL / VLD_SPILL or VLDB + +--- +name: VLDA_Spill_VLD_1 +alignment: 16 +stack: + - { id: 0, name: '', type: spill-slot, offset: 0, size: 32, alignment: 32 } +body: | + bb.0.entry: + ; STACK-BANK-A-LABEL: name: VLDA_Spill_VLD_1 + ; STACK-BANK-A: $wl1 = VLDA_dmw_lda_w_ag_spill -96, implicit $sp :: (load (s256) from %stack.0, addrspace 5) + ; STACK-BANK-A-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; STACK-BANK-A-NEXT: NOP + ; STACK-BANK-A-NEXT: NOP + ; STACK-BANK-A-NEXT: NOP + ; STACK-BANK-A-NEXT: NOP + ; STACK-BANK-A-NEXT: NOP + ; STACK-BANK-A-NEXT: NOP + ; + ; STACK-BANK-B-LABEL: name: VLDA_Spill_VLD_1 + ; STACK-BANK-B: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit $sp, implicit killed $p0 { + ; STACK-BANK-B-NEXT: $wl1 = VLDA_dmw_lda_w_ag_spill -96, implicit $sp :: (load (s256) from %stack.0, addrspace 6) + ; STACK-BANK-B-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; STACK-BANK-B-NEXT: } + ; STACK-BANK-B-NEXT: NOP + ; STACK-BANK-B-NEXT: NOP + ; STACK-BANK-B-NEXT: NOP + ; STACK-BANK-B-NEXT: NOP + ; STACK-BANK-B-NEXT: NOP + ; STACK-BANK-B-NEXT: NOP + $wl1 = VLDA_dmw_lda_w_ag_spill -96, implicit $sp :: (load (s256) from %stack.0) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) +... 
+ +--- +name: VLDA_Spill_VLD_2 +alignment: 16 +stack: + - { id: 0, name: '', type: spill-slot, offset: 0, size: 32, alignment: 32 } +body: | + bb.0.entry: + ; STACK-BANK-A-LABEL: name: VLDA_Spill_VLD_2 + ; STACK-BANK-A: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit $sp, implicit killed $p0 { + ; STACK-BANK-A-NEXT: $wl1 = VLDA_dmw_lda_w_ag_spill -96, implicit $sp :: (load (s256) from %stack.0, addrspace 5) + ; STACK-BANK-A-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; STACK-BANK-A-NEXT: } + ; STACK-BANK-A-NEXT: NOP + ; STACK-BANK-A-NEXT: NOP + ; STACK-BANK-A-NEXT: NOP + ; STACK-BANK-A-NEXT: NOP + ; STACK-BANK-A-NEXT: NOP + ; STACK-BANK-A-NEXT: NOP + ; + ; STACK-BANK-B-LABEL: name: VLDA_Spill_VLD_2 + ; STACK-BANK-B: $wl1 = VLDA_dmw_lda_w_ag_spill -96, implicit $sp :: (load (s256) from %stack.0, addrspace 6) + ; STACK-BANK-B-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; STACK-BANK-B-NEXT: NOP + ; STACK-BANK-B-NEXT: NOP + ; STACK-BANK-B-NEXT: NOP + ; STACK-BANK-B-NEXT: NOP + ; STACK-BANK-B-NEXT: NOP + ; STACK-BANK-B-NEXT: NOP + $wl1 = VLDA_dmw_lda_w_ag_spill -96, implicit $sp :: (load (s256) from %stack.0) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) +... 
+ +--- +name: LDA_Spill_VLD_1 +alignment: 16 +stack: + - { id: 0, name: '', type: spill-slot, offset: 0, size: 32, alignment: 32 } +body: | + bb.0.entry: + ; STACK-BANK-A-LABEL: name: LDA_Spill_VLD_1 + ; STACK-BANK-A: $r0 = LDA_dms_spill -28, implicit $sp :: (load (s32) from %stack.0, addrspace 5) + ; STACK-BANK-A-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; STACK-BANK-A-NEXT: NOP + ; STACK-BANK-A-NEXT: NOP + ; STACK-BANK-A-NEXT: NOP + ; STACK-BANK-A-NEXT: NOP + ; STACK-BANK-A-NEXT: NOP + ; STACK-BANK-A-NEXT: NOP + ; + ; STACK-BANK-B-LABEL: name: LDA_Spill_VLD_1 + ; STACK-BANK-B: BUNDLE implicit-def $r0, implicit-def $wl2, implicit $sp, implicit killed $p0 { + ; STACK-BANK-B-NEXT: $r0 = LDA_dms_spill -28, implicit $sp :: (load (s32) from %stack.0, addrspace 6) + ; STACK-BANK-B-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; STACK-BANK-B-NEXT: } + ; STACK-BANK-B-NEXT: NOP + ; STACK-BANK-B-NEXT: NOP + ; STACK-BANK-B-NEXT: NOP + ; STACK-BANK-B-NEXT: NOP + ; STACK-BANK-B-NEXT: NOP + ; STACK-BANK-B-NEXT: NOP + $r0 = LDA_dms_spill -28, implicit $sp :: (load (s32) from %stack.0) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) +... 
+ +--- +name: LDA_Spill_VLD_2 +alignment: 16 +stack: + - { id: 0, name: '', type: spill-slot, offset: 0, size: 32, alignment: 32 } +body: | + bb.0.entry: + ; STACK-BANK-A-LABEL: name: LDA_Spill_VLD_2 + ; STACK-BANK-A: BUNDLE implicit-def $r0, implicit-def $wl2, implicit $sp, implicit killed $p0 { + ; STACK-BANK-A-NEXT: $r0 = LDA_dms_spill -28, implicit $sp :: (load (s32) from %stack.0, addrspace 5) + ; STACK-BANK-A-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; STACK-BANK-A-NEXT: } + ; STACK-BANK-A-NEXT: NOP + ; STACK-BANK-A-NEXT: NOP + ; STACK-BANK-A-NEXT: NOP + ; STACK-BANK-A-NEXT: NOP + ; STACK-BANK-A-NEXT: NOP + ; STACK-BANK-A-NEXT: NOP + ; + ; STACK-BANK-B-LABEL: name: LDA_Spill_VLD_2 + ; STACK-BANK-B: $r0 = LDA_dms_spill -28, implicit $sp :: (load (s32) from %stack.0, addrspace 6) + ; STACK-BANK-B-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; STACK-BANK-B-NEXT: NOP + ; STACK-BANK-B-NEXT: NOP + ; STACK-BANK-B-NEXT: NOP + ; STACK-BANK-B-NEXT: NOP + ; STACK-BANK-B-NEXT: NOP + ; STACK-BANK-B-NEXT: NOP + $r0 = LDA_dms_spill -28, implicit $sp :: (load (s32) from %stack.0) + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) +... 
From 88f3fe9ee88e85b86d0f2ee9a49d70bd808e7b67 Mon Sep 17 00:00:00 2001 From: Krishnam Tibrewala Date: Fri, 9 Aug 2024 11:17:33 -0700 Subject: [PATCH 21/31] [AIE2] Annotate Tile Memory AS --- llvm/lib/Target/AIE/AIE2AddrSpace.h | 2 +- llvm/lib/Target/AIE/AIE2Subtarget.cpp | 3 + llvm/lib/Target/AIE/AIE2TargetMachine.cpp | 3 + .../GlobalISel/inst-select-store_load_TM.mir | 4 +- llvm/test/CodeGen/AIE/aie2/schedule/load.mir | 2 +- .../AIE/aie2/schedule/memory_access_DM_TM.mir | 110 ++++++++++++++++++ .../AIE/aie2/schedule/pre_ra/load_storetm.mir | 4 +- .../AIE/aie2/schedule/resource/proc_bus.mir | 4 +- llvm/test/CodeGen/AIE/aie2/schedule/store.mir | 2 +- llvm/test/CodeGen/AIE/aie2/schedule/tmdep.mir | 32 ++--- 10 files changed, 141 insertions(+), 25 deletions(-) create mode 100644 llvm/test/CodeGen/AIE/aie2/schedule/memory_access_DM_TM.mir diff --git a/llvm/lib/Target/AIE/AIE2AddrSpace.h b/llvm/lib/Target/AIE/AIE2AddrSpace.h index 70e40c372757..3229b0b94236 100644 --- a/llvm/lib/Target/AIE/AIE2AddrSpace.h +++ b/llvm/lib/Target/AIE/AIE2AddrSpace.h @@ -38,7 +38,7 @@ enum class AddressSpaces { TM // Address space for TM (Tile Memory) }; -enum class AIEBanks { A, B, C, D }; +enum class AIEBanks { A, B, C, D, TileMemory }; } // end namespace AIE2 } // end namespace llvm diff --git a/llvm/lib/Target/AIE/AIE2Subtarget.cpp b/llvm/lib/Target/AIE/AIE2Subtarget.cpp index 72a880de19e0..10c7f3e9bbd7 100644 --- a/llvm/lib/Target/AIE/AIE2Subtarget.cpp +++ b/llvm/lib/Target/AIE/AIE2Subtarget.cpp @@ -129,6 +129,9 @@ AIE2Subtarget::getMemoryBanksFromAddressSpace(unsigned AddrSpace) const { MemoryBanks.set(static_cast(AIEBanks::C)) .set(static_cast(AIEBanks::D)); break; + case AddressSpaces::TM: + MemoryBanks.set(static_cast(AIEBanks::TileMemory)); + break; default: return getDefaultMemoryBank(); break; diff --git a/llvm/lib/Target/AIE/AIE2TargetMachine.cpp b/llvm/lib/Target/AIE/AIE2TargetMachine.cpp index 61d966c0d05f..6f0ab31c03c8 100644 --- a/llvm/lib/Target/AIE/AIE2TargetMachine.cpp 
+++ b/llvm/lib/Target/AIE/AIE2TargetMachine.cpp @@ -21,6 +21,7 @@ #include "AIEFinalizeBundle.h" #include "AIEMachineAlignment.h" #include "AIEMachineBlockPlacement.h" +#include "AIEMachineFunctionInfo.h" #include "AIETargetObjectFile.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" @@ -271,6 +272,8 @@ AIE2TargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const { case PseudoSourceValue::Stack: case PseudoSourceValue::FixedStack: return StackAddrSpace; + case AIETargetPSV::AIETileMem: + return static_cast(AIE2::AddressSpaces::TM); default: return static_cast(AIE2::AddressSpaces::none); } diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-store_load_TM.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-store_load_TM.mir index 3af3d13af3ef..37442b043960 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-store_load_TM.mir +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-store_load_TM.mir @@ -18,7 +18,7 @@ body: | ; CHECK-LABEL: name: Read_TM ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[LDA_TM:%[0-9]+]]:er = LDA_TM %2:ep :: (load (s32) from custom "TileMemory") + ; CHECK-NEXT: [[LDA_TM:%[0-9]+]]:er = LDA_TM %2:ep :: (load (s32) from custom "TileMemory", addrspace 15) ; CHECK-NEXT: $r0 = COPY [[LDA_TM]] ; CHECK-NEXT: PseudoRET implicit $lr, implicit $r0 %1:ptrregbank(p0) = COPY $p0 @@ -39,7 +39,7 @@ body: | ; CHECK-LABEL: name: Write_TM ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: ST_TM %1:er, %2:ep :: (store (s32) into custom "TileMemory") + ; CHECK-NEXT: ST_TM %1:er, %2:ep :: (store (s32) into custom "TileMemory", addrspace 15) ; CHECK-NEXT: PseudoRET implicit $lr %1:ptrregbank(p0) = COPY $p0 G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2.write.tm), %0:gprregbank(s32), %5:ptrregbank(p0) diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/load.mir b/llvm/test/CodeGen/AIE/aie2/schedule/load.mir index b70f8209c688..782e27d33414 100644 --- 
a/llvm/test/CodeGen/AIE/aie2/schedule/load.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/load.mir @@ -39,7 +39,7 @@ body: | ; CHECK-LABEL: name: load_tm ; CHECK: liveins: $p1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $r6 = LDA_TM killed $p1 :: (load (s32) from custom "TileMemory") + ; CHECK-NEXT: $r6 = LDA_TM killed $p1 :: (load (s32) from custom "TileMemory", addrspace 15) ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/memory_access_DM_TM.mir b/llvm/test/CodeGen/AIE/aie2/schedule/memory_access_DM_TM.mir new file mode 100644 index 000000000000..af5d527f5398 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/memory_access_DM_TM.mir @@ -0,0 +1,110 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# RUN: llc -march=aie2 -run-pass=postmisched %topdown-multi %s -o - | FileCheck %s + +# This test checks scheduling of LDA_TM with VLDB + +# We expect LDA.TM & VLDB to be scheduled in the same bundle when the VLDB MMO does not have an address space +--- +name: LDA_TM_VLDB_without_AS +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: LDA_TM_VLDB_without_AS + ; CHECK: BUNDLE implicit-def $r0, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $r0 = LDA_TM $p0 :: (load (s32) from custom "TileMemory", addrspace 15) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>)) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $r0 = LDA_TM $p0 :: (load (s32) from custom "TileMemory") + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>)) +... 
+ +--- +name: LDA_TM_VLDB_from_BankA +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: LDA_TM_VLDB_from_BankA + ; CHECK: BUNDLE implicit-def $r0, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $r0 = LDA_TM $p0 :: (load (s32) from custom "TileMemory", addrspace 15) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $r0 = LDA_TM $p0 :: (load (s32) from custom "TileMemory") + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) +... + +--- +name: LDA_TM_VLDB_from_BankB +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: LDA_TM_VLDB_from_BankB + ; CHECK: BUNDLE implicit-def $r0, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $r0 = LDA_TM $p0 :: (load (s32) from custom "TileMemory", addrspace 15) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $r0 = LDA_TM $p0 :: (load (s32) from custom "TileMemory") + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) +... + +--- +name: LDA_TM_VLDB_from_BankC +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: LDA_TM_VLDB_from_BankC + ; CHECK: BUNDLE implicit-def $r0, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $r0 = LDA_TM $p0 :: (load (s32) from custom "TileMemory", addrspace 15) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $r0 = LDA_TM $p0 :: (load (s32) from custom "TileMemory") + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) +... 
+ +--- +name: LDA_TM_VLDB_from_BankD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: LDA_TM_VLDB_from_BankD + ; CHECK: BUNDLE implicit-def $r0, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $r0 = LDA_TM $p0 :: (load (s32) from custom "TileMemory", addrspace 15) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $r0 = LDA_TM $p0 :: (load (s32) from custom "TileMemory") + $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/load_storetm.mir b/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/load_storetm.mir index 1b651ae652bb..0db72c6e8f72 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/load_storetm.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/pre_ra/load_storetm.mir @@ -30,9 +30,9 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY [[COPY]] ; CHECK-NEXT: [[LDA_dms_lda_idx_imm1:%[0-9]+]]:er = LDA_dms_lda_idx_imm [[COPY1]], 0 :: (load (s32) from %ir.p0) ; CHECK-NEXT: [[MOVXM_lng_cg:%[0-9]+]]:ep_as_32bit = MOVXM_lng_cg -524284 - ; CHECK-NEXT: ST_TM [[LDA_dms_lda_idx_imm]], [[MOVXM_lng_cg]] :: (store (s32) into custom "TileMemory") + ; CHECK-NEXT: ST_TM [[LDA_dms_lda_idx_imm]], [[MOVXM_lng_cg]] :: (store (s32) into custom "TileMemory", addrspace 15) ; CHECK-NEXT: [[MOVXM_lng_cg1:%[0-9]+]]:ep_as_32bit = MOVXM_lng_cg -524280 - ; CHECK-NEXT: ST_TM [[LDA_dms_lda_idx_imm1]], [[MOVXM_lng_cg1]] :: (store (s32) into custom "TileMemory") + ; CHECK-NEXT: ST_TM [[LDA_dms_lda_idx_imm1]], [[MOVXM_lng_cg1]] :: (store (s32) into custom "TileMemory", addrspace 15) %100:ep = COPY $p0 %200:er = LDA_dms_lda_idx_imm %100, 0 :: (load (s32) from %ir.p0) %300:ep_as_32bit = MOVXM_lng_cg -524284 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/resource/proc_bus.mir 
b/llvm/test/CodeGen/AIE/aie2/schedule/resource/proc_bus.mir index 572d0edd7518..5aea7fa0220a 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/resource/proc_bus.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/resource/proc_bus.mir @@ -23,8 +23,8 @@ body: | ; CHECK-NEXT: $p0 = MOV_mv_scl killed $r1 ; CHECK-NEXT: } ; CHECK-NEXT: $p1 = MOV_mv_scl killed $r4 - ; CHECK-NEXT: ST_TM killed $r0, killed $p0 :: (store (s32) into custom "TileMemory") - ; CHECK-NEXT: $r0 = LDA_TM killed $p1 :: (load (s32) from custom "TileMemory") + ; CHECK-NEXT: ST_TM killed $r0, killed $p0 :: (store (s32) into custom "TileMemory", addrspace 15) + ; CHECK-NEXT: $r0 = LDA_TM killed $p1 :: (load (s32) from custom "TileMemory", addrspace 15) ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/store.mir b/llvm/test/CodeGen/AIE/aie2/schedule/store.mir index 4e0977ad0221..1fae92ab88d9 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/store.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/store.mir @@ -40,7 +40,7 @@ body: | ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP - ; CHECK-NEXT: ST_TM killed $r6, killed $p0 :: (store (s32) into custom "TileMemory") + ; CHECK-NEXT: ST_TM killed $r6, killed $p0 :: (store (s32) into custom "TileMemory", addrspace 15) ; CHECK-NEXT: NOP $r6 = LDA_dms_lda_idx_imm $p1, 4 ST_TM $r6, $p0 :: (store (s32) into custom "TileMemory") diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/tmdep.mir b/llvm/test/CodeGen/AIE/aie2/schedule/tmdep.mir index 747935d9de5c..e968c43de622 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/tmdep.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/tmdep.mir @@ -75,11 +75,11 @@ body: | liveins: $p0 ; CHECK-LABEL: name: nodep_wtm_rmem ; CHECK: LDA_dms_lda_idx_imm killed renamable $p0, 0 :: (load (s32) from %ir.p) - ; CHECK: ST_TM killed renamable $r1, killed renamable $p1 :: (store (s32) into custom "TileMemory") + ; CHECK: ST_TM killed renamable $r1, killed renamable $p1 :: (store (s32) into custom 
"TileMemory", addrspace 15) renamable $r1 = MOVA_lda_cg 42 renamable $p1 = MOVXM_lng_cg -524284 - ST_TM killed renamable $r1, killed renamable $p1 :: (store (s32) into custom "TileMemory") + ST_TM killed renamable $r1, killed renamable $p1 :: (store (s32) into custom "TileMemory", addrspace 15) renamable $r0 = LDA_dms_lda_idx_imm killed renamable $p0, 0 :: (load (s32) from %ir.p) PseudoRET implicit $lr, implicit $r0 @@ -99,10 +99,10 @@ body: | ; CHECK-LABEL: name: nodep_rtm_wmem ; CHECK: ST_dms_sts_idx_imm killed renamable $r1, killed renamable $p0, 0 :: (store (s32) into %ir.p) - ; CHECK: renamable $r0 = LDA_TM killed renamable $p1 :: (load (s32) from custom "TileMemory") + ; CHECK: renamable $r0 = LDA_TM killed renamable $p1 :: (load (s32) from custom "TileMemory", addrspace 15) renamable $p1 = MOVXM_lng_cg -524282 ST_dms_sts_idx_imm killed renamable $r1, killed renamable $p0, 0 :: (store (s32) into %ir.p) - renamable $r0 = LDA_TM killed renamable $p1 :: (load (s32) from custom "TileMemory") + renamable $r0 = LDA_TM killed renamable $p1 :: (load (s32) from custom "TileMemory", addrspace 15) ... 
--- @@ -118,12 +118,12 @@ body: | bb.0.entry: liveins: $p1 ; CHECK-LABEL: name: true_tm - ; CHECK: ST_TM killed renamable $r1, killed renamable $p0 :: (store (s32) into custom "TileMemory") - ; CHECK: renamable $r0 = LDA_TM killed renamable $p1 :: (load (s32) from custom "TileMemory") + ; CHECK: ST_TM killed renamable $r1, killed renamable $p0 :: (store (s32) into custom "TileMemory", addrspace 15) + ; CHECK: renamable $r0 = LDA_TM killed renamable $p1 :: (load (s32) from custom "TileMemory", addrspace 15) renamable $r1 = MOVA_lda_cg 42 renamable $p0 = MOVXM_lng_cg -524283 - ST_TM killed renamable $r1, renamable $p0 :: (store (s32) into custom "TileMemory") - renamable $r0 = LDA_TM killed renamable $p1 :: (load (s32) from custom "TileMemory") + ST_TM killed renamable $r1, renamable $p0 :: (store (s32) into custom "TileMemory", addrspace 15) + renamable $r0 = LDA_TM killed renamable $p1 :: (load (s32) from custom "TileMemory", addrspace 15) ... --- @@ -139,11 +139,11 @@ body: | bb.0.entry: liveins: $p0, $r1 ; CHECK-LABEL: name: anti_tm - ; CHECK: renamable $r0 = LDA_TM renamable $p0 :: (load (s32) from custom "TileMemory") - ; CHECK: ST_TM killed renamable $r1, killed renamable $p0 :: (store (s32) into custom "TileMemory") - renamable $r0 = LDA_TM renamable $p0 :: (load (s32) from custom "TileMemory") + ; CHECK: renamable $r0 = LDA_TM renamable $p0 :: (load (s32) from custom "TileMemory", addrspace 15) + ; CHECK: ST_TM killed renamable $r1, killed renamable $p0 :: (store (s32) into custom "TileMemory", addrspace 15) + renamable $r0 = LDA_TM renamable $p0 :: (load (s32) from custom "TileMemory", addrspace 15) renamable $r1 = MOVA_lda_cg 42 - ST_TM killed renamable $r1, renamable $p0 :: (store (s32) into custom "TileMemory") + ST_TM killed renamable $r1, renamable $p0 :: (store (s32) into custom "TileMemory", addrspace 15) ... 
--- @@ -160,14 +160,14 @@ body: | liveins: $r0, $r1 ; CHECK-LABEL: name: out_tm - ; CHECK: ST_TM killed renamable $r3, killed renamable $p0 :: (store (s32) into custom "TileMemory") - ; CHECK: ST_TM killed renamable $r0, killed renamable $p0 :: (store (s32) into custom "TileMemory") + ; CHECK: ST_TM killed renamable $r3, killed renamable $p0 :: (store (s32) into custom "TileMemory", addrspace 15) + ; CHECK: ST_TM killed renamable $r0, killed renamable $p0 :: (store (s32) into custom "TileMemory", addrspace 15) renamable $p0 = MOVXM_lng_cg -524283 renamable $r2 = MOVXM_lng_cg 524288 renamable $r3 = nsw ADD_add_r_ri renamable $r0, 13, implicit-def $srcarry - ST_TM killed renamable $r3, killed renamable $p0 :: (store (s32) into custom "TileMemory") + ST_TM killed renamable $r3, killed renamable $p0 :: (store (s32) into custom "TileMemory", addrspace 15) renamable $r1 = ADD killed renamable $r1, killed renamable $r2, implicit-def $srcarry $p0 = MOV_mv_scl killed $r1 - ST_TM killed renamable $r0, killed renamable $p0 :: (store (s32) into custom "TileMemory") + ST_TM killed renamable $r0, killed renamable $p0 :: (store (s32) into custom "TileMemory", addrspace 15) ... From eaf8501a9778cb9a7d52ef8e26bc634e1f6ab9d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Fri, 9 Aug 2024 14:47:10 +0100 Subject: [PATCH 22/31] [AIEX] Disable tail merging This is really bad for AIE and its 5 delay slots after branches. We almost never want to insert a new branch to share common instructions, especially in a loop. This had a very bad habit of destroying our 3-stage SW pipelines. 
:) --- llvm/lib/Target/AIE/AIETargetMachine.cpp | 10 ++ llvm/lib/Target/AIE/AIETargetMachine.h | 3 +- llvm/test/CodeGen/AIE/aie1/analyze-branch.ll | 34 +++-- .../CodeGen/AIE/aie2/intrinsics-shufflevec.ll | 81 +++++++---- llvm/test/CodeGen/AIE/aie2/no-tail-merge.mir | 134 ++++++++++++++++++ .../AIE/aie2/ra/tie-subregs-flow-3d.mir | 10 +- .../CodeGen/AIE/aie2/ra/tie-subregs-flow.mir | 23 ++- 7 files changed, 235 insertions(+), 60 deletions(-) create mode 100644 llvm/test/CodeGen/AIE/aie2/no-tail-merge.mir diff --git a/llvm/lib/Target/AIE/AIETargetMachine.cpp b/llvm/lib/Target/AIE/AIETargetMachine.cpp index 3d077a5faa49..16cd5f621d34 100644 --- a/llvm/lib/Target/AIE/AIETargetMachine.cpp +++ b/llvm/lib/Target/AIE/AIETargetMachine.cpp @@ -54,6 +54,11 @@ static cl::opt cl::desc("Enable AIE alias analysis pass"), cl::init(true), cl::Hidden); +static cl::opt + EnableTailMergingOpt("aie-enable-tail-merge", + cl::desc("Enable tail merging for AIE."), + cl::init(false), cl::Hidden); + // Option to run internalize pass. static cl::opt InternalizeSymbols( "aie-internalize-symbols", @@ -143,6 +148,11 @@ TargetPassConfig *AIETargetMachine::createPassConfig(PassManagerBase &PM) { return new AIEPassConfig(*this, PM); } +AIEPassConfig::AIEPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) { + EnableTailMerge = EnableTailMergingOpt; +} + void AIEPassConfig::addIRPasses() { // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg // ourselves. 
diff --git a/llvm/lib/Target/AIE/AIETargetMachine.h b/llvm/lib/Target/AIE/AIETargetMachine.h index 0d1b170c6acc..096bc0986b26 100644 --- a/llvm/lib/Target/AIE/AIETargetMachine.h +++ b/llvm/lib/Target/AIE/AIETargetMachine.h @@ -81,8 +81,7 @@ class AIETargetMachine : public AIEBaseTargetMachine { class AIEPassConfig : public TargetPassConfig { public: - AIEPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} + AIEPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM); AIETargetMachine &getAIETargetMachine() const { return getTM(); diff --git a/llvm/test/CodeGen/AIE/aie1/analyze-branch.ll b/llvm/test/CodeGen/AIE/aie1/analyze-branch.ll index 98da0400b649..9ea8d5e0c061 100644 --- a/llvm/test/CodeGen/AIE/aie1/analyze-branch.ll +++ b/llvm/test/CodeGen/AIE/aie1/analyze-branch.ll @@ -30,7 +30,7 @@ define void @test_bcc_fallthrough_taken(i32 %in) nounwind { ; OPT-NEXT: padda [sp], #32 ; OPT-NEXT: ne r12, r6, r12 ; OPT-NEXT: st.spil lr, [sp, #-32] // 4-byte Folded Spill -; OPT-NEXT: bnez r12, .LBB0_3 +; OPT-NEXT: bnez r12, .LBB0_2 ; OPT-NEXT: nop // Delay Slot 5 ; OPT-NEXT: nop // Delay Slot 4 ; OPT-NEXT: nop // Delay Slot 3 @@ -44,9 +44,6 @@ define void @test_bcc_fallthrough_taken(i32 %in) nounwind { ; OPT-NEXT: nop // Delay Slot 3 ; OPT-NEXT: nop // Delay Slot 2 ; OPT-NEXT: nop // Delay Slot 1 -; OPT-NEXT: .p2align 4 -; OPT-NEXT: .LBB0_2: // %true -; OPT-NEXT: // Label of block must be emitted ; OPT-NEXT: ldb lr, [sp, #-32] // 4-byte Folded Reload ; OPT-NEXT: padda [sp], #-32 ; OPT-NEXT: nop @@ -62,7 +59,7 @@ define void @test_bcc_fallthrough_taken(i32 %in) nounwind { ; OPT-NEXT: nop // Delay Slot 2 ; OPT-NEXT: nop // Delay Slot 1 ; OPT-NEXT: .p2align 4 -; OPT-NEXT: .LBB0_3: // %false +; OPT-NEXT: .LBB0_2: // %false ; OPT-NEXT: // Label of block must be emitted ; OPT-NEXT: jal test_false ; OPT-NEXT: nop // Delay Slot 5 @@ -70,7 +67,15 @@ define void @test_bcc_fallthrough_taken(i32 %in) nounwind { ; OPT-NEXT: nop // Delay Slot 3 ; OPT-NEXT: 
nop // Delay Slot 2 ; OPT-NEXT: nop // Delay Slot 1 -; OPT-NEXT: j .LBB0_2 +; OPT-NEXT: ldb lr, [sp, #-32] // 4-byte Folded Reload +; OPT-NEXT: padda [sp], #-32 +; OPT-NEXT: nop +; OPT-NEXT: nop +; OPT-NEXT: nop +; OPT-NEXT: nop +; OPT-NEXT: nop +; OPT-NEXT: nop +; OPT-NEXT: ret lr ; OPT-NEXT: nop // Delay Slot 5 ; OPT-NEXT: nop // Delay Slot 4 ; OPT-NEXT: nop // Delay Slot 3 @@ -159,7 +164,7 @@ define void @test_bcc_fallthrough_nottaken(i32 %in) nounwind { ; OPT-NEXT: padda [sp], #32 ; OPT-NEXT: ne r12, r6, r12 ; OPT-NEXT: st.spil lr, [sp, #-32] // 4-byte Folded Spill -; OPT-NEXT: beqz r12, .LBB1_3 +; OPT-NEXT: beqz r12, .LBB1_2 ; OPT-NEXT: nop // Delay Slot 5 ; OPT-NEXT: nop // Delay Slot 4 ; OPT-NEXT: nop // Delay Slot 3 @@ -173,9 +178,6 @@ define void @test_bcc_fallthrough_nottaken(i32 %in) nounwind { ; OPT-NEXT: nop // Delay Slot 3 ; OPT-NEXT: nop // Delay Slot 2 ; OPT-NEXT: nop // Delay Slot 1 -; OPT-NEXT: .p2align 4 -; OPT-NEXT: .LBB1_2: // %true -; OPT-NEXT: // Label of block must be emitted ; OPT-NEXT: ldb lr, [sp, #-32] // 4-byte Folded Reload ; OPT-NEXT: padda [sp], #-32 ; OPT-NEXT: nop @@ -191,7 +193,7 @@ define void @test_bcc_fallthrough_nottaken(i32 %in) nounwind { ; OPT-NEXT: nop // Delay Slot 2 ; OPT-NEXT: nop // Delay Slot 1 ; OPT-NEXT: .p2align 4 -; OPT-NEXT: .LBB1_3: // %true +; OPT-NEXT: .LBB1_2: // %true ; OPT-NEXT: // Label of block must be emitted ; OPT-NEXT: jal test_true ; OPT-NEXT: nop // Delay Slot 5 @@ -199,7 +201,15 @@ define void @test_bcc_fallthrough_nottaken(i32 %in) nounwind { ; OPT-NEXT: nop // Delay Slot 3 ; OPT-NEXT: nop // Delay Slot 2 ; OPT-NEXT: nop // Delay Slot 1 -; OPT-NEXT: j .LBB1_2 +; OPT-NEXT: ldb lr, [sp, #-32] // 4-byte Folded Reload +; OPT-NEXT: padda [sp], #-32 +; OPT-NEXT: nop +; OPT-NEXT: nop +; OPT-NEXT: nop +; OPT-NEXT: nop +; OPT-NEXT: nop +; OPT-NEXT: nop +; OPT-NEXT: ret lr ; OPT-NEXT: nop // Delay Slot 5 ; OPT-NEXT: nop // Delay Slot 4 ; OPT-NEXT: nop // Delay Slot 3 diff --git 
a/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll b/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll index f33c6811ccd6..118c3e1821e4 100644 --- a/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll +++ b/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll @@ -18,7 +18,7 @@ define <8 x i32> @test_extract_vector(<16 x i32> noundef %a, i32 noundef %idx) { ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: mov r8, r16 // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %if.end -; CHECK-NEXT: mova r16, #8 +; CHECK-NEXT: mova r16, #8; nopb ; nopxm ; nops ; CHECK-NEXT: vextract.s32 r0, x2, r16 ; CHECK-NEXT: nop ; CHECK-NEXT: mova r16, #9 @@ -35,15 +35,25 @@ define <8 x i32> @test_extract_vector(<16 x i32> noundef %a, i32 noundef %idx) { ; CHECK-NEXT: nop ; CHECK-NEXT: mova r16, #13 ; CHECK-NEXT: vextract.s32 r5, x2, r16 -; CHECK-NEXT: j #.LBB0_3 -; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: mova r16, #15 // Delay Slot 4 -; CHECK-NEXT: vextract.s32 r6, x2, r16 // Delay Slot 3 -; CHECK-NEXT: nop // Delay Slot 2 -; CHECK-NEXT: mova r16, #14 // Delay Slot 1 +; CHECK-NEXT: nop +; CHECK-NEXT: mova r16, #15 +; CHECK-NEXT: vextract.s32 r6, x2, r16 +; CHECK-NEXT: nop +; CHECK-NEXT: mova r16, #14 +; CHECK-NEXT: vextract.s32 r7, x2, r16 +; CHECK-NEXT: vpush.lo.32 x0, r6, x0 +; CHECK-NEXT: vpush.lo.32 x0, r7, x0 +; CHECK-NEXT: vpush.lo.32 x0, r5, x0 +; CHECK-NEXT: vpush.lo.32 x0, r4, x0 +; CHECK-NEXT: ret lr +; CHECK-NEXT: vpush.lo.32 x0, r3, x0 // Delay Slot 5 +; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 4 +; CHECK-NEXT: vpush.lo.32 x0, r1, x0 // Delay Slot 3 +; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2 +; CHECK-NEXT: mov r16, r8 // Delay Slot 1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %if.then -; CHECK-NEXT: mova r16, #0; nopxm +; CHECK-NEXT: mova r16, #0; nopb ; nopxm ; nops ; CHECK-NEXT: vextract.s32 r0, x2, r16 ; CHECK-NEXT: nop ; CHECK-NEXT: mova r16, #1 @@ -65,9 +75,7 @@ define <8 x i32> @test_extract_vector(<16 x i32> noundef %a, i32 noundef %idx) { 
; CHECK-NEXT: vextract.s32 r6, x2, r16 ; CHECK-NEXT: nop ; CHECK-NEXT: mova r16, #6 -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB0_3: // %return -; CHECK-NEXT: nopx ; vextract.s32 r7, x2, r16 +; CHECK-NEXT: vextract.s32 r7, x2, r16 ; CHECK-NEXT: vpush.lo.32 x0, r6, x0 ; CHECK-NEXT: vpush.lo.32 x0, r7, x0 ; CHECK-NEXT: vpush.lo.32 x0, r5, x0 @@ -133,7 +141,7 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, < ; CHECK-NEXT: vpush.lo.32 x0, r1, x0 // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %if.end ; CHECK-NEXT: nopb ; mova r16, #3; nops ; nopxm ; nopv -; CHECK-NEXT: vextract.s32 r0, x2, r19 +; CHECK-NEXT: nopa ; vextract.s32 r0, x2, r19 ; CHECK-NEXT: vextract.s32 r1, x0, r19 ; CHECK-NEXT: vextract.s32 r2, x2, r18 ; CHECK-NEXT: vextract.s32 r3, x0, r18 @@ -153,15 +161,30 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, < ; CHECK-NEXT: mova r16, #7 ; CHECK-NEXT: vextract.s32 r12, x2, r16 ; CHECK-NEXT: vextract.s32 r13, x0, r16 +; CHECK-NEXT: nop +; CHECK-NEXT: mova r16, #6 +; CHECK-NEXT: vextract.s32 r14, x2, r16 +; CHECK-NEXT: vextract.s32 r15, x0, r16 +; CHECK-NEXT: vpush.lo.32 x0, r13, x0 +; CHECK-NEXT: vpush.lo.32 x0, r15, x0 +; CHECK-NEXT: vpush.lo.32 x0, r11, x0 +; CHECK-NEXT: vpush.lo.32 x0, r9, x0 +; CHECK-NEXT: vpush.lo.32 x0, r7, x0 +; CHECK-NEXT: vpush.lo.32 x0, r5, x0 +; CHECK-NEXT: vpush.lo.32 x0, r3, x0 +; CHECK-NEXT: vpush.lo.32 x0, r1, x0 +; CHECK-NEXT: vpush.lo.32 x0, r12, x0 +; CHECK-NEXT: vpush.lo.32 x0, r14, x0 +; CHECK-NEXT: vpush.lo.32 x0, r10, x0 ; CHECK-NEXT: j #.LBB1_3 -; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: mova r16, #6 // Delay Slot 4 -; CHECK-NEXT: vextract.s32 r14, x2, r16 // Delay Slot 3 -; CHECK-NEXT: vextract.s32 r15, x0, r16 // Delay Slot 2 -; CHECK-NEXT: nop // Delay Slot 1 +; CHECK-NEXT: vpush.lo.32 x0, r8, x0 // Delay Slot 5 +; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 4 +; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 3 +; CHECK-NEXT: vpush.lo.32 x0, 
r2, x0 // Delay Slot 2 +; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB1_2: // %if.then -; CHECK-NEXT: mova r16, #3; nopx +; CHECK-NEXT: mova r16, #3; nopb ; nopx ; CHECK-NEXT: vextract.s32 r0, x0, r19 ; CHECK-NEXT: vextract.s32 r1, x2, r19 ; CHECK-NEXT: vextract.s32 r2, x0, r18 @@ -186,12 +209,6 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, < ; CHECK-NEXT: mova r16, #6 ; CHECK-NEXT: vextract.s32 r14, x0, r16 ; CHECK-NEXT: vextract.s32 r15, x2, r16 -; CHECK-NEXT: nop -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB1_3: // %cleanup -; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov r19, r27; nopv -; CHECK-NEXT: mov r18, r26 -; CHECK-NEXT: mov r17, r25 ; CHECK-NEXT: vpush.lo.32 x0, r13, x0 ; CHECK-NEXT: vpush.lo.32 x0, r15, x0 ; CHECK-NEXT: vpush.lo.32 x0, r11, x0 @@ -204,11 +221,17 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, < ; CHECK-NEXT: vpush.lo.32 x0, r14, x0 ; CHECK-NEXT: vpush.lo.32 x0, r10, x0 ; CHECK-NEXT: vpush.lo.32 x0, r8, x0 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 5 -; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4 -; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 3 -; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2 +; CHECK-NEXT: vpush.lo.32 x0, r6, x0 +; CHECK-NEXT: vpush.lo.32 x0, r4, x0 +; CHECK-NEXT: vpush.lo.32 x0, r2, x0 +; CHECK-NEXT: vpush.lo.32 x0, r0, x0 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB1_3: // %cleanup +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: mov r19, r27 // Delay Slot 4 +; CHECK-NEXT: mov r18, r26 // Delay Slot 3 +; CHECK-NEXT: mov r17, r25 // Delay Slot 2 ; CHECK-NEXT: mov r16, r24 // Delay Slot 1 entry: %shuffle = shufflevector <8 x i32> %b, <8 x i32> undef, <16 x i32> diff --git a/llvm/test/CodeGen/AIE/aie2/no-tail-merge.mir b/llvm/test/CodeGen/AIE/aie2/no-tail-merge.mir new file mode 100644 index 
000000000000..59e71f3e72b1 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/no-tail-merge.mir @@ -0,0 +1,134 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc -mtriple=aie2 -start-before=branch-folder --stop-after=block-placement \ +# RUN: --tail-merge-size=1 %s -o - \ +# RUN: | FileCheck %s --check-prefix=TAILMERGE-DEF +# RUN: llc -mtriple=aie2 -start-before=branch-folder --stop-after=block-placement \ +# RUN: --aie-enable-tail-merge=1 --tail-merge-size=1 %s -o - \ +# RUN: | FileCheck %s --check-prefix=TAILMERGE-ON + +# This is a small test that shows how tail merging can absolutely destroy +# carefully crafted SW pipelines. If we are unlucky to have common instructions +# between a prologue and the steady state, tail merging can insert an unconditional +# jump in the middle of that loop to share the instruction sequence. Obviously this +# is bad for AIE, because of delay slots. Complex control flow also makes scheduling +# harder. 
+--- +name: mini_add_swp +tracksRegLiveness: true +body: | + ; TAILMERGE-DEF-LABEL: name: mini_add_swp + ; TAILMERGE-DEF: bb.0.entry: + ; TAILMERGE-DEF-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; TAILMERGE-DEF-NEXT: liveins: $p0, $p1, $p2, $m0, $s0, $x0, $r0, $r1, $d0_3d + ; TAILMERGE-DEF-NEXT: {{ $}} + ; TAILMERGE-DEF-NEXT: $r0 = ADD_add_r_ri $r0, -4, implicit-def $srcarry + ; TAILMERGE-DEF-NEXT: $cm0, $p0 = VLDA_UPS_S32_D8_ag_pstm_nrm $s0, $p0, $m0, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; TAILMERGE-DEF-NEXT: $cm1, $p1, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 $s0, $p1, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + ; TAILMERGE-DEF-NEXT: $cm2 = VADD $cm0, $cm1, $r1 + ; TAILMERGE-DEF-NEXT: $r0 = ADD_add_r_ri $r0, -4, implicit-def $srcarry + ; TAILMERGE-DEF-NEXT: $cm0, $p0 = VLDA_UPS_S32_D8_ag_pstm_nrm $s0, $p0, $m0, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; TAILMERGE-DEF-NEXT: $cm1, $p1, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 $s0, $p1, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + ; TAILMERGE-DEF-NEXT: PseudoJZ $r0, %bb.2 + ; TAILMERGE-DEF-NEXT: {{ $}} + ; TAILMERGE-DEF-NEXT: bb.1: + ; TAILMERGE-DEF-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; TAILMERGE-DEF-NEXT: liveins: $p0, $p1, $p2, $m0, $cm0, $cm1, $cm2, $s0, $x0, $r0, $r1, $d0_3d + ; TAILMERGE-DEF-NEXT: {{ $}} + ; TAILMERGE-DEF-NEXT: $p2 = VST_SRS_D8_S32_ag_pstm_nrm_imm $p2, 32, $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; TAILMERGE-DEF-NEXT: $cm2 = VADD $cm0, $cm1, $r1 + ; TAILMERGE-DEF-NEXT: $r0 = ADD_add_r_ri $r0, -4, implicit-def $srcarry + ; TAILMERGE-DEF-NEXT: $cm0, $p0 = VLDA_UPS_S32_D8_ag_pstm_nrm $s0, $p0, $m0, implicit-def $srups_of, implicit $crsat, 
implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; TAILMERGE-DEF-NEXT: $cm1, $p1, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 $s0, $p1, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + ; TAILMERGE-DEF-NEXT: PseudoJNZ $r0, %bb.1 + ; TAILMERGE-DEF-NEXT: {{ $}} + ; TAILMERGE-DEF-NEXT: bb.2: + ; TAILMERGE-DEF-NEXT: liveins: $p2, $cm0, $cm1, $cm2, $s0, $r1 + ; TAILMERGE-DEF-NEXT: {{ $}} + ; TAILMERGE-DEF-NEXT: $p2 = VST_SRS_D8_S32_ag_pstm_nrm_imm $p2, 32, $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; TAILMERGE-DEF-NEXT: $cm2 = VADD $cm0, $cm1, $r1 + ; TAILMERGE-DEF-NEXT: $p2 = VST_SRS_D8_S32_ag_pstm_nrm_imm $p2, 32, $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; TAILMERGE-DEF-NEXT: PseudoRET implicit $lr + ; + ; TAILMERGE-ON-LABEL: name: mini_add_swp + ; TAILMERGE-ON: bb.0.entry: + ; TAILMERGE-ON-NEXT: successors: %bb.2(0x80000000) + ; TAILMERGE-ON-NEXT: liveins: $p0, $p1, $p2, $m0, $s0, $x0, $r0, $r1, $d0_3d + ; TAILMERGE-ON-NEXT: {{ $}} + ; TAILMERGE-ON-NEXT: $r0 = ADD_add_r_ri $r0, -4, implicit-def $srcarry + ; TAILMERGE-ON-NEXT: $cm0, $p0 = VLDA_UPS_S32_D8_ag_pstm_nrm $s0, $p0, $m0, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; TAILMERGE-ON-NEXT: $cm1, $p1, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 $s0, $p1, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + ; TAILMERGE-ON-NEXT: {{ $}} + ; TAILMERGE-ON-NEXT: bb.2: + ; TAILMERGE-ON-NEXT: successors: %bb.3(0x07878788), %bb.1(0x78787878) + ; TAILMERGE-ON-NEXT: liveins: $p2, $d0_3d, $p0, $r1, $s0, $p1, $x0, $r0, $cm0, $cm1 + ; TAILMERGE-ON-NEXT: {{ $}} + ; TAILMERGE-ON-NEXT: $cm2 = VADD $cm0, $cm1, $r1 + ; TAILMERGE-ON-NEXT: $r0 = ADD_add_r_ri $r0, -4, implicit-def $srcarry + ; 
TAILMERGE-ON-NEXT: $cm0, $p0 = VLDA_UPS_S32_D8_ag_pstm_nrm $s0, $p0, $m0, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + ; TAILMERGE-ON-NEXT: $cm1, $p1, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 $s0, $p1, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + ; TAILMERGE-ON-NEXT: PseudoJZ $r0, %bb.3 + ; TAILMERGE-ON-NEXT: {{ $}} + ; TAILMERGE-ON-NEXT: bb.1: + ; TAILMERGE-ON-NEXT: successors: %bb.2(0x80000000) + ; TAILMERGE-ON-NEXT: liveins: $p0, $p1, $p2, $m0, $cm0, $cm1, $cm2, $s0, $x0, $r0, $r1, $d0_3d + ; TAILMERGE-ON-NEXT: {{ $}} + ; TAILMERGE-ON-NEXT: $p2 = VST_SRS_D8_S32_ag_pstm_nrm_imm $p2, 32, $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; TAILMERGE-ON-NEXT: PseudoJ_jump_imm %bb.2 + ; TAILMERGE-ON-NEXT: {{ $}} + ; TAILMERGE-ON-NEXT: bb.3: + ; TAILMERGE-ON-NEXT: liveins: $p2, $cm0, $cm1, $cm2, $s0, $r1 + ; TAILMERGE-ON-NEXT: {{ $}} + ; TAILMERGE-ON-NEXT: $p2 = VST_SRS_D8_S32_ag_pstm_nrm_imm $p2, 32, $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; TAILMERGE-ON-NEXT: $cm2 = VADD $cm0, $cm1, $r1 + ; TAILMERGE-ON-NEXT: $p2 = VST_SRS_D8_S32_ag_pstm_nrm_imm $p2, 32, $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + ; TAILMERGE-ON-NEXT: PseudoRET implicit $lr + + bb.0.entry: + liveins: $p0, $p1, $p2, $m0, $s0, $x0, $r0, $r1, $d0_3d + + $r0 = ADD_add_r_ri $r0, -4, implicit-def $srcarry + $cm0, $p0 = VLDA_UPS_S32_D8_ag_pstm_nrm $s0, $p0, $m0, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + $cm1, $p1, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 $s0, $p1, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + PseudoJ_jump_imm %bb.4 
+ + bb.4: + liveins: $p0, $p1, $p2, $m0, $cm0, $cm1, $s0, $x0, $r0, $r1, $d0_3d + $cm2 = VADD $cm0, $cm1, $r1 + $r0 = ADD_add_r_ri $r0, -4, implicit-def $srcarry + $cm0, $p0 = VLDA_UPS_S32_D8_ag_pstm_nrm $s0, $p0, $m0, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + $cm1, $p1, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 $s0, $p1, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + + PseudoJNZ $r0, %bb.1 + PseudoJ_jump_imm %bb.2 + + bb.1: + liveins: $p0, $p1, $p2, $m0, $cm0, $cm1, $cm2, $s0, $x0, $r0, $r1, $d0_3d + successors: %bb.2(0x04000000), %bb.1(0x7c000000) + + $p2 = VST_SRS_D8_S32_ag_pstm_nrm_imm $p2, 32, $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + $cm2 = VADD $cm0, $cm1, $r1 + $r0 = ADD_add_r_ri $r0, -4, implicit-def $srcarry + $cm0, $p0 = VLDA_UPS_S32_D8_ag_pstm_nrm $s0, $p0, $m0, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64) + $cm1, $p1, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 $s0, $p1, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32) + PseudoJNZ $r0, %bb.1 + PseudoJ_jump_imm %bb.2 + + bb.2: + liveins: $p2, $cm0, $cm1, $cm2, $s0, $r1 + $p2 = VST_SRS_D8_S32_ag_pstm_nrm_imm $p2, 32, $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + $cm2 = VADD $cm0, $cm1, $r1 + + bb.3: + liveins: $p2, $cm2, $s0 + $p2 = VST_SRS_D8_S32_ag_pstm_nrm_imm $p2, 32, $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<32 x s8>) into stack - 128) + + bb.5: + PseudoRET implicit $lr +... 
diff --git a/llvm/test/CodeGen/AIE/aie2/ra/tie-subregs-flow-3d.mir b/llvm/test/CodeGen/AIE/aie2/ra/tie-subregs-flow-3d.mir index 53d67b09d727..7f92425e94a4 100644 --- a/llvm/test/CodeGen/AIE/aie2/ra/tie-subregs-flow-3d.mir +++ b/llvm/test/CodeGen/AIE/aie2/ra/tie-subregs-flow-3d.mir @@ -340,13 +340,19 @@ body: | ; CHECK-NEXT: DelayedSchedBarrier ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: successors: ; CHECK-NEXT: liveins: $dc4, $dj0, $dj4, $dn0, $dn4, $m0, $p0, $r9, $d0_3d:0x000000000001C860 ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP ; CHECK-NEXT: $dc0 = MOV_mv_scl killed $r9 + ; CHECK-NEXT: $p0, $dc0, $dc4 = PADDA_3D killed $p0, killed $d0_3d + ; CHECK-NEXT: DelayedSchedBarrier implicit killed renamable $dc0, implicit killed renamable $p0, implicit killed renamable $dc4 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2 (align 16): - ; CHECK-NEXT: liveins: $p0, $d0_3d + ; CHECK-NEXT: liveins: $dc0, $dc4, $dj0, $dj4, $dn0, $dn4, $m0, $p0, $d0_3d:0x000000000001C870 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: RET implicit $lr ; CHECK-NEXT: NOP diff --git a/llvm/test/CodeGen/AIE/aie2/ra/tie-subregs-flow.mir b/llvm/test/CodeGen/AIE/aie2/ra/tie-subregs-flow.mir index 672441a99de0..85415fe8250a 100644 --- a/llvm/test/CodeGen/AIE/aie2/ra/tie-subregs-flow.mir +++ b/llvm/test/CodeGen/AIE/aie2/ra/tie-subregs-flow.mir @@ -38,7 +38,6 @@ body: | %3:edc = COPY $r3 %4:ep = COPY $p0 - ; ISel code for %20(p0), %30(i20) = G_INTRINSIC(add_2d_byte) %4, %0, %1, %2, %3 %10:ed = REG_SEQUENCE %0, %subreg.sub_mod, %1, %subreg.sub_dim_size, %2, %subreg.sub_dim_stride, %3, %subreg.sub_dim_count %20:ep, %30:edc = PADDA_2D %4, %10 @@ -110,15 +109,12 @@ body: | %4:ep = COPY $p0 %5:ep = COPY $p1 - ; ISel code for: %20(p0), %12(i20) = G_INTRINSIC(add_2d_byte) %4, %0, %1, %2, %3 %10:ed = REG_SEQUENCE %0, %subreg.sub_mod, %1, %subreg.sub_dim_size, %2, %subreg.sub_dim_stride, %3, 
%subreg.sub_dim_count %20:ep, %12:edc = PADDA_2D %4, %10 - ; ISel code for: %40(p0), %32(i20) = G_INTRINSIC(add_2d_byte) %20, %0, %1, %2, %12 %30:ed = REG_SEQUENCE %0, %subreg.sub_mod, %1, %subreg.sub_dim_size, %2, %subreg.sub_dim_stride, %12, %subreg.sub_dim_count %40:ep, %32:edc = PADDA_2D %20, %30 - ; Return the twice-incremented pointer. PseudoRET implicit $lr, implicit %40, implicit $m0 ... @@ -156,19 +152,15 @@ body: | %4:ep = COPY $p0 %5:ep = COPY $p1 - ; ISel code for: %200(p0), %300(i20) = G_INTRINSIC(add_2d_byte) %4, %0, %1, %2, %3 %100:ed = REG_SEQUENCE %0, %subreg.sub_mod, %1, %subreg.sub_dim_size, %2, %subreg.sub_dim_stride, %3, %subreg.sub_dim_count %200:ep, %300:edc = PADDA_2D %4, %100 - ; ISel code for: %201(p0), %301(i20) = G_INTRINSIC(add_2d_byte) %5, %0, %1, %2, %3 %101:ed = REG_SEQUENCE %0, %subreg.sub_mod, %1, %subreg.sub_dim_size, %2, %subreg.sub_dim_stride, %3, %subreg.sub_dim_count %201:ep, %301:edc = PADDA_2D %5, %101 - ; ISel code for: %202(p0), %302(i20) = G_INTRINSIC(add_2d_byte) %200, %0, %1, %2, %300 %102:ed = REG_SEQUENCE %0, %subreg.sub_mod, %1, %subreg.sub_dim_size, %2, %subreg.sub_dim_stride, %300, %subreg.sub_dim_count %202:ep, %302:edc = PADDA_2D %200, %102 - ; Return the pointers. PseudoRET implicit $lr, implicit %201, implicit %202, implicit $m0 ... 
@@ -219,23 +211,18 @@ body: | %8:edj = COPY $r6 %9:edc = COPY $r7 - ; ISel code for: %200(p0), %300(i20) = G_INTRINSIC(add_2d_byte) %4, %0, %1, %2, %3 %100:ed = REG_SEQUENCE %0, %subreg.sub_mod, %1, %subreg.sub_dim_size, %2, %subreg.sub_dim_stride, %3, %subreg.sub_dim_count %200:ep, %300:edc = PADDA_2D %4, %100 - ; ISel code for: %201(p0), %301(i20) = G_INTRINSIC(add_2d_byte) %5, %0, %1, %2, %3 %101:ed = REG_SEQUENCE %0, %subreg.sub_mod, %1, %subreg.sub_dim_size, %2, %subreg.sub_dim_stride, %3, %subreg.sub_dim_count %201:ep, %301:edc = PADDA_2D %5, %101 - ; ISel code for: %202(p0), %302(i20) = G_INTRINSIC(add_2d_byte) %5, %6, %7, %8, %9 %102:ed = REG_SEQUENCE %6, %subreg.sub_mod, %7, %subreg.sub_dim_size, %8, %subreg.sub_dim_stride, %9, %subreg.sub_dim_count %202:ep, %302:edc = PADDA_2D %5, %102 - ; ISel code for: %203(p0), %303(i20) = G_INTRINSIC(add_2d_byte) %200, %0, %1, %2, %300 %103:ed = REG_SEQUENCE %0, %subreg.sub_mod, %1, %subreg.sub_dim_size, %2, %subreg.sub_dim_stride, %301, %subreg.sub_dim_count %203:ep, %303:edc = PADDA_2D %200, %103 - ; Return the twice-incremented pointer. PseudoRET implicit $lr, implicit %203, implicit %202, implicit $m0, implicit $d2 , implicit $d3 , implicit $d4 , implicit $d5 , implicit $d6 ... 
@@ -459,13 +446,19 @@ body: | ; CHECK-NEXT: DelayedSchedBarrier ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: successors: ; CHECK-NEXT: liveins: $d0:0x0000000000000860, $dj0, $dn0, $m0, $p0, $r5 ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP ; CHECK-NEXT: $dc0 = MOV_mv_scl killed $r5 + ; CHECK-NEXT: $p0, $dc0 = PADDA_2D killed $p0, killed $d0 + ; CHECK-NEXT: DelayedSchedBarrier implicit killed renamable $dc0, implicit killed renamable $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2 (align 16): - ; CHECK-NEXT: liveins: $p0, $d0 + ; CHECK-NEXT: liveins: $d0:0x0000000000000870, $dc0, $dj0, $dn0, $m0, $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: RET implicit $lr ; CHECK-NEXT: NOP From 791ce44c061000f8694f5601ed69201edb876fe5 Mon Sep 17 00:00:00 2001 From: Martien de Jong Date: Tue, 16 Jul 2024 15:48:08 +0200 Subject: [PATCH 23/31] [AIE] Baseline test for non-dedicated exit --- .../loopaware/non-dedicated-exit-nops.mir | 104 ++++++++++++++++++ .../schedule/loopaware/non-dedicated-exit.mir | 97 ++++++++++++++++ 2 files changed, 201 insertions(+) create mode 100644 llvm/test/CodeGen/AIE/aie2/schedule/loopaware/non-dedicated-exit-nops.mir create mode 100644 llvm/test/CodeGen/AIE/aie2/schedule/loopaware/non-dedicated-exit.mir diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/non-dedicated-exit-nops.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/non-dedicated-exit-nops.mir new file mode 100644 index 000000000000..f59bb07ad38e --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/non-dedicated-exit-nops.mir @@ -0,0 +1,104 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 + +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# We have a loop with an epilogue shared by a guard branch that needs nops +# in the epilogue + + +# RUN: llc --mtriple=aie2 --run-pass=postmisched %s -o - | FileCheck %s +--- +name: cpy +alignment: 16 +exposesReturnsTwice: false +legalized: true +regBankSelected: true +selected: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: cpy + ; CHECK: bb.0.entry (align 16): + ; CHECK-NEXT: successors: %bb.1(0x50000000), %bb.3(0x30000000) + ; CHECK-NEXT: liveins: $p0, $p1, $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $r1 = MOVA_lda_cg 0 + ; CHECK-NEXT: $r1 = GE killed $r1, $r0 + ; CHECK-NEXT: JNZ $r1, %bb.3 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $p0, $p1, $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK-NEXT: liveins: $p0, $p1, $p2, $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $p1 = PADDA_lda_ptr_inc_idx_imm killed $p1, 4 + ; CHECK-NEXT: $p1 = PADDA_lda_ptr_inc_idx_imm killed $p1, 4 + ; CHECK-NEXT: BUNDLE implicit-def $r1, implicit-def $r0, implicit-def $srcarry, implicit $p1, implicit killed $r0 { + ; CHECK-NEXT: $r1 = LDA_dms_lda_idx_imm $p1, 4 + ; CHECK-NEXT: $r0 = ADD_add_r_ri killed $r0, -1, implicit-def $srcarry + ; CHECK-NEXT: } + ; CHECK-NEXT: JNZ $r0, %bb.2 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: liveins: $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BUNDLE implicit-def $r0, implicit $lr, implicit killed $r0, implicit killed $r1 { + ; CHECK-NEXT: RET implicit $lr, implicit killed $r0 + ; 
CHECK-NEXT: $r0 = MOV_mv_scl killed $r1 + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE killed $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + bb.2: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) + liveins: $p0, $p1, $p2, $r0 + + $p1 = PADDA_lda_ptr_inc_idx_imm $p1, 4 + $p1 = PADDA_lda_ptr_inc_idx_imm $p1, 4 + $r1 = LDA_dms_lda_idx_imm $p1, 4 + + $r0 = ADD_add_r_ri $r0, -1, implicit-def $srcarry + JNZ $r0, %bb.2 + DelayedSchedBarrier + + bb.3: + liveins: $r1 + $r0 = MOV_mv_scl $r1 + RET implicit $lr, implicit $r0 + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/non-dedicated-exit.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/non-dedicated-exit.mir new file mode 100644 index 000000000000..def02f66e1ac --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/non-dedicated-exit.mir @@ -0,0 +1,97 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 + +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + + +# We have a loop with an epilogue shared by a guard branch + +# RUN: llc --mtriple=aie2 --run-pass=postmisched %s -o - | FileCheck %s +--- +name: cpy +alignment: 16 +exposesReturnsTwice: false +legalized: true +regBankSelected: true +selected: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: cpy + ; CHECK: bb.0.entry (align 16): + ; CHECK-NEXT: successors: %bb.1(0x50000000), %bb.3(0x30000000) + ; CHECK-NEXT: liveins: $p0, $p1, $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $r1 = MOVA_lda_cg 0 + ; CHECK-NEXT: $r1 = GE killed $r1, $r0 + ; CHECK-NEXT: JNZ killed $r1, %bb.3 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $p0, $p1, $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $r0 = ADD_NC_GPR killed $r0, -1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK-NEXT: liveins: $p0, $p1, $p2, $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $r1, $p1 = LDA_dms_lda_pstm_nrm_imm killed $p1, 4 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $r0 = ADD_add_r_ri killed $r0, -1, implicit-def $srcarry + ; CHECK-NEXT: JNZ $r0, %bb.2 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $r1 = nsw ADD_add_r_ri killed $r1, 1, implicit-def $srcarry + ; CHECK-NEXT: $p0 = ST_dms_sts_pstm_nrm_imm killed $r1, killed $p0, 4 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1: + 
successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $r0 = ADD_NC_GPR $r0, -1 + + bb.2: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) + liveins: $p0, $p1, $p2, $r0 + + $r1, $p1 = LDA_dms_lda_pstm_nrm_imm $p1, 4 + $r1 = nsw ADD_add_r_ri $r1, 1, implicit-def $srcarry + $p0 = ST_dms_sts_pstm_nrm_imm $r1, $p0, 4 + $r0 = ADD_add_r_ri $r0, -1, implicit-def $srcarry + JNZ $r0,%bb.2 + DelayedSchedBarrier + + bb.3: + RET implicit $lr + DelayedSchedBarrier + +... From b6f79260b771e9ecdb90881e94145d01d1b8cd14 Mon Sep 17 00:00:00 2001 From: Martien de Jong Date: Wed, 17 Jul 2024 10:38:14 +0200 Subject: [PATCH 24/31] [AIE] Accept more cases for loop-aware scheduling. --- .../Target/AIE/AIEInterBlockScheduling.cpp | 83 +++++++++++++++---- .../test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll | 7 ++ .../AIE/aie2/hardware-loops/zol-loop.ll | 7 +- .../aie2/schedule/loopaware/loop-classify.mir | 45 ++++++++-- .../loopaware/non-dedicated-exit-nops.mir | 19 +++-- .../schedule/loopaware/non-dedicated-exit.mir | 3 +- .../AIE/aie2/schedule/swp/doloop-stage0.ll | 61 +++++++++++--- .../CodeGen/AIE/aie2/schedule/swp/stage0.ll | 30 ++++--- 8 files changed, 194 insertions(+), 61 deletions(-) diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp index b56c73ddc9cd..d185c47120db 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp @@ -85,10 +85,18 @@ void emitBundlesInScoreboardDelta( } } -MachineBasicBlock *getSinglePredecessor(const MachineBasicBlock &MBB) { - assert(MBB.pred_size() == 1 && "MBB contains more than 1 predecessor"); - MachineBasicBlock *SinglePredMBB = *MBB.predecessors().begin(); - return SinglePredMBB; +MachineBasicBlock *getLoopPredecessor(const MachineBasicBlock &MBB) { + if (MBB.pred_size() == 1) { + // if we have only one, it must be the loop + return *MBB.predecessors().begin(); + } + // Otherwise, the loop is the fallthrough predecessor by 
construction + for (auto *Pred : MBB.predecessors()) { + if (Pred->isLayoutSuccessor(&MBB)) { + return Pred; + } + } + return nullptr; } InterBlockScheduling::InterBlockScheduling(const MachineSchedContext *C, @@ -443,6 +451,25 @@ void InterBlockScheduling::enterRegion(MachineBasicBlock *BB, BS.addRegion(BB, RegionBegin, RegionEnd); } } + +// Create a block, insert it before Succ, and route the control flow edge +// between Pred and Succ through it. +// Since we don't add any control flow instructions, the edge should be a +// fallthrough edge; it will be replaced with two fallthrough edges and a block +MachineBasicBlock *splitEdge(MachineBasicBlock *Pred, MachineBasicBlock *Succ) { + auto *MF = Pred->getParent(); + MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(Succ->getBasicBlock()); + MF->insert(Succ->getIterator(), NewBB); + for (auto *Edge : make_early_inc_range(Pred->successors())) { + if (Edge == Succ) { + Pred->removeSuccessor(Succ); + } + } + NewBB->addSuccessor(Succ); + Pred->addSuccessor(NewBB); + return NewBB; +} + int InterBlockScheduling::getNumEntryNops(const BlockState &BS) const { // Epilogues should supply the safety margin for their loop. // That loop is the only predecessor by construction of @@ -450,32 +477,53 @@ int InterBlockScheduling::getNumEntryNops(const BlockState &BS) const { if (BS.Kind != BlockType::Epilogue) { return 0; } - const MachineBasicBlock &BB = *BS.TheBlock; - assert(BB.pred_size() == 1); - MachineBasicBlock *Loop = getSinglePredecessor(BB); + MachineBasicBlock &BB = *BS.TheBlock; + + MachineBasicBlock *Loop = getLoopPredecessor(BB); + assert(Loop); auto &LBS = getBlockState(Loop); + if (BB.pred_size() > 1) { + // The loop is a fallthrough predecessor by construction. We insert a + // new block that will be a dedicated exit to the loop. + } // We can only analyze non-empty epilogue blocks because we need // to build a DDG, which is not possible. 
// For empty ones, we need to be conservative because we are not aware of // content of epilogues' successor. + int SafetyMargin = LBS.getSafetyMargin(); if (LoopEpilogueAnalysis && BB.size() > 0) { int ExistingLatency = getCyclesToRespectTiming(BS, LBS); // Start the next step only after clearing latencies. - return getCyclesToAvoidResourceConflicts(ExistingLatency, BS, LBS); + SafetyMargin = getCyclesToAvoidResourceConflicts(ExistingLatency, BS, LBS); + } + if (SafetyMargin && BB.pred_size() > 1) { + DEBUG_LOOPAWARE(dbgs() << "New dedicated exit with " << SafetyMargin + << " nops.\n"); + // The loop is a fallthrough predecessor by construction. We insert a + // new block that will be a dedicated exit to the loop. + MachineBasicBlock *DedicatedExit = splitEdge(Loop, &BB); + const auto &SubTarget = BS.TheBlock->getParent()->getSubtarget(); + auto *TII = static_cast(SubTarget.getInstrInfo()); + // Our caller can't see this block, so we fill nops ourselves. The original + // epilogue will not need entry nops, so we can return 0. + auto It = DedicatedExit->begin(); + while (SafetyMargin--) { + TII->insertNoop(*DedicatedExit, It); + } + return 0; } - return LBS.getSafetyMargin(); + return SafetyMargin; } int InterBlockScheduling::getCyclesToRespectTiming( const BlockState &EpilogueBS, const BlockState &LoopBS) const { const MachineBasicBlock &EpilogueMBB = *EpilogueBS.TheBlock; - const MachineBasicBlock *LoopMBB = getSinglePredecessor(EpilogueMBB); DEBUG_LOOPAWARE(dbgs() << "** Loop/Epilogue-carried latency dependencies:" - << " Original Loop " << *LoopMBB + << " Original Loop " << *LoopBS.TheBlock << " Original Epilogue " << EpilogueMBB << "\n"); InterBlockEdges Edges(*Context); @@ -654,7 +702,7 @@ void BlockState::classify() { // We must push the safety margin to our epilogue block(s) // This can only be done if we have an epilogue and the epilogue is not itself // a loop. 
- auto IsLoop = [](MachineBasicBlock *MBB) { + auto IsLoop = [](const MachineBasicBlock *MBB) { int NumLoopEdges = 0; int NumExitEdges = 0; for (auto *S : MBB->successors()) { @@ -667,10 +715,13 @@ void BlockState::classify() { return NumLoopEdges == 1 && NumExitEdges == 1; }; // We generalize slightly; we require the epilogue to be a dedicated exit of - // the loop. - auto CanFixLoopSchedule = [L = TheBlock](auto *S) { - // Either the backedge, or a dedicated loop exit - return S == L || S->pred_size() == 1; + // the loop, or a fallthrough block, so that we can squeeze in a dedicated + // exit. + auto CanFixLoopSchedule = [L = TheBlock, + &IsLoop](const MachineBasicBlock *S) { + // Either the backedge, or a dedicated loop exit, or a fallthrough loop exit + return S == L || S->pred_size() == 1 || + (L->isLayoutSuccessor(S) && !IsLoop(S)); }; // If we don't mark up any loops, we will iterate in the same order and apply diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll index 8bbe2b744c83..096219363439 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll @@ -1,5 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -O2 -mtriple=aie2 --enable-pipeliner=0 %s -o - | FileCheck %s +; +; This file is licensed under the Apache License v2.0 with LLVM Exceptions. +; See https://llvm.org/LICENSE.txt for license information. +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +; +; (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates + ; RUN: opt -mtriple=aie2 -passes=aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AA ; A reduced example from MLLib's mul2d benchmark. 
diff --git a/llvm/test/CodeGen/AIE/aie2/hardware-loops/zol-loop.ll b/llvm/test/CodeGen/AIE/aie2/hardware-loops/zol-loop.ll index 39ed8f318910..afe8e7a00343 100644 --- a/llvm/test/CodeGen/AIE/aie2/hardware-loops/zol-loop.ll +++ b/llvm/test/CodeGen/AIE/aie2/hardware-loops/zol-loop.ll @@ -35,17 +35,16 @@ define void @simple_loop(i32 noundef %n, ptr nocapture readonly %in, ptr nocaptu ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lda r3, [p0, #0] -; CHECK-NEXT: nop +; CHECK-NEXT: lda r3, [p0, #0]; nopb ; nopx ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: lshl r4, r1, r0 +; CHECK-NEXT: add r1, r1, #1 ; CHECK-NEXT: add r3, r2, r3; mov dj0, r4 -; CHECK-NEXT: st r3, [p1, dj0]; add r1, r1, #1 ; CHECK-NEXT: .L_LEnd0: -; CHECK-NEXT: nopb ; nopa ; nops ; add r2, r2, #-1; nopm ; nopv +; CHECK-NEXT: nopb ; nopa ; st r3, [p1, dj0]; add r2, r2, #-1; nopm ; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup ; CHECK-NEXT: nopa ; ret lr diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-classify.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-classify.mir index ffb7039be6f4..405d66245d86 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-classify.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-classify.mir @@ -37,14 +37,14 @@ body: | DelayedSchedBarrier ... -# bb.2 is not a dedicated exit of the loop bb.1, and can therefore not -# serve as an epiloge. Hence bb.1 can not be considered as a loop +# bb.2 is not a dedicated exit of the loop bb.1, but it is a fallthrough +# successor. Hence we can insert the safety margin in the fallthrough. 
--- -name: nonDedicatedEpi +name: safeNonDedicatedEpi tracksRegLiveness: true body: | - ; CHECK-LABEL: nonDedicatedEpi - ; CHECK: MBB scheduling sequence : 3 -> 2 -> 1 -> 0 + ; CHECK-LABEL: safeNonDedicatedEpi + ; CHECK: MBB scheduling sequence : 1 -> 3 -> 2 -> 0 bb.0: liveins: $r0 successors: %bb.1, %bb.2 @@ -66,5 +66,38 @@ body: | DelayedSchedBarrier ... - +# bb.3 is a non-dedicated, non-fallthrough epilogue +# It cannot be used to hold the safety margin +# Note that the relative ordering of 1 and 2 is not +# deterministic under postorder constraints +--- +name: unsafeNonDedicatedEpi +tracksRegLiveness: true +body: | + ; CHECK-LABEL: unsafeNonDedicatedEpi + ; CHECK: MBB scheduling sequence : 4 -> 3 -> {{[12]}} -> {{[12]}} -> 0 + bb.0: + liveins: $r0 + successors: %bb.1, %bb.2 + JNZ $r0, %bb.3 + DelayedSchedBarrier + bb.1: + liveins: $r0 + successors: %bb.1, %bb.3 + $r0 = ADD_add_r_ri $r0, -1, implicit-def $srcarry + JNZ $r0, %bb.1 + DelayedSchedBarrier + J_jump_imm %bb.3 + DelayedSchedBarrier + bb.2: + successors: %bb.3 + bb.3: + liveins: $r0 + successors: %bb.4 + $r0 = ADD_add_r_ri $r0, -1, implicit-def $srcarry + bb.4: + liveins: $r0 + RET implicit $lr, implicit $r0 + DelayedSchedBarrier +... 
diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/non-dedicated-exit-nops.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/non-dedicated-exit-nops.mir index f59bb07ad38e..497877795d49 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/non-dedicated-exit-nops.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/non-dedicated-exit-nops.mir @@ -40,22 +40,27 @@ body: | ; CHECK-NEXT: liveins: $p0, $p1, $r0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.4(0x04000000) ; CHECK-NEXT: liveins: $p0, $p1, $p2, $r0 ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $r0 = ADD_add_r_ri killed $r0, -1, implicit-def $srcarry + ; CHECK-NEXT: JNZ $r0, %bb.2 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP ; CHECK-NEXT: $p1 = PADDA_lda_ptr_inc_idx_imm killed $p1, 4 ; CHECK-NEXT: $p1 = PADDA_lda_ptr_inc_idx_imm killed $p1, 4 - ; CHECK-NEXT: BUNDLE implicit-def $r1, implicit-def $r0, implicit-def $srcarry, implicit $p1, implicit killed $r0 { - ; CHECK-NEXT: $r1 = LDA_dms_lda_idx_imm $p1, 4 - ; CHECK-NEXT: $r0 = ADD_add_r_ri killed $r0, -1, implicit-def $srcarry - ; CHECK-NEXT: } - ; CHECK-NEXT: JNZ $r0, %bb.2 + ; CHECK-NEXT: $r1 = LDA_dms_lda_idx_imm $p1, 4 + ; CHECK-NEXT: DelayedSchedBarrier + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP - ; CHECK-NEXT: DelayedSchedBarrier ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: liveins: $r1 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/non-dedicated-exit.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/non-dedicated-exit.mir index def02f66e1ac..f4cfeda75129 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/non-dedicated-exit.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/non-dedicated-exit.mir @@ -46,14 +46,13 @@ body: | 
; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $r1, $p1 = LDA_dms_lda_pstm_nrm_imm killed $p1, 4 ; CHECK-NEXT: NOP - ; CHECK-NEXT: NOP ; CHECK-NEXT: $r0 = ADD_add_r_ri killed $r0, -1, implicit-def $srcarry ; CHECK-NEXT: JNZ $r0, %bb.2 ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP ; CHECK-NEXT: $r1 = nsw ADD_add_r_ri killed $r1, 1, implicit-def $srcarry ; CHECK-NEXT: $p0 = ST_dms_sts_pstm_nrm_imm killed $r1, killed $p0, 4 - ; CHECK-NEXT: NOP ; CHECK-NEXT: DelayedSchedBarrier ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/swp/doloop-stage0.ll b/llvm/test/CodeGen/AIE/aie2/schedule/swp/doloop-stage0.ll index 8c24125d887b..098bf4aab1b6 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/swp/doloop-stage0.ll +++ b/llvm/test/CodeGen/AIE/aie2/schedule/swp/doloop-stage0.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; ; This file is licensed under the Apache License v2.0 with LLVM Exceptions. ; See https://llvm.org/LICENSE.txt for license information. @@ -14,20 +15,54 @@ ; using padda rather than the padd pseudo. 
This is awaiting proper handling ; of pseudos in pre-RA scheduling/pipelining -; CHECK-LABEL dot: -; CHECK: add [[LC:r[0-9]+]], r1, #-1 -; CHECK: jz [[LC]], #.LBB0_4 - -; CHECK: add [[LC]], [[LC]], #-1 -; CHECK: jz [[LC]], #.LBB0_3 - -; CHECK:.LBB0_2: -; CHECK: add [[LC]], [[LC]], #-1 -; CHECK: jnz [[LC]], #.LBB0_2 -; CHECK:.LBB0_3: -; CHECK:.LBB0_4: - define dso_local i32 @dot(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 noundef %n) { +; CHECK-LABEL: dot: +; CHECK: .p2align 4 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: nopa ; add r5, r1, #-1 +; CHECK-NEXT: jz r5, #.LBB0_5 +; CHECK-NEXT: lda r2, [p0, #0] // Delay Slot 5 +; CHECK-NEXT: padda [p0], #2044 // Delay Slot 4 +; CHECK-NEXT: lda r3, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: padda [p1], #2044 // Delay Slot 2 +; CHECK-NEXT: mova r0, #0 // Delay Slot 1 +; CHECK-NEXT: // %bb.1: // %do.body +; CHECK-NEXT: lda r1, [p0, #0]; nopx +; CHECK-NEXT: lda r4, [p1, #0]; add r5, r5, #-1 +; CHECK-NEXT: jz r5, #.LBB0_4 +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: nop // Delay Slot 3 +; CHECK-NEXT: padda [p0], #2044 // Delay Slot 2 +; CHECK-NEXT: padda [p1], #2044 // Delay Slot 1 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_2: // %do.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: nopa ; nopb ; add r5, r5, #-1; nopm +; CHECK-NEXT: jnz r5, #.LBB0_2 +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: lda r1, [p0, #0] // Delay Slot 4 +; CHECK-NEXT: padda [p0], #2044 // Delay Slot 3 +; CHECK-NEXT: lda r4, [p1, #0]; and r6, r3, r2; mov r3, r4 // Delay Slot 2 +; CHECK-NEXT: padda [p1], #2044; or r0, r6, r0; mov r2, r1 // Delay Slot 1 +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: nopa ; nopb ; nopx +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: and r2, r3, r2; mov r3, r4 +; CHECK-NEXT: or r0, r2, r0; mov r2, r1 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_5: +; 
CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv +; CHECK-NEXT: nopx // Delay Slot 5 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: nop // Delay Slot 3 +; CHECK-NEXT: and r1, r3, r2 // Delay Slot 2 +; CHECK-NEXT: or r0, r1, r0 // Delay Slot 1 entry: br label %do.body diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/swp/stage0.ll b/llvm/test/CodeGen/AIE/aie2/schedule/swp/stage0.ll index cf7f63572026..cfac58a870c2 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/swp/stage0.ll +++ b/llvm/test/CodeGen/AIE/aie2/schedule/swp/stage0.ll @@ -19,7 +19,7 @@ define dso_local i32 @dot(ptr nocapture readonly %a, ptr nocapture readonly %b, ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: mova r0, #0; nopb ; nopxm ; CHECK-NEXT: ge r2, r0, r1 -; CHECK-NEXT: jnz r2, #.LBB0_6 +; CHECK-NEXT: jnz r2, #.LBB0_7 ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 @@ -28,7 +28,7 @@ define dso_local i32 @dot(ptr nocapture readonly %a, ptr nocapture readonly %b, ; CHECK-NEXT: // %bb.1: // %for.body ; CHECK-NEXT: lda r2, [p0, #0]; nopx ; CHECK-NEXT: lda r3, [p1, #0]; add r5, r1, #-1 -; CHECK-NEXT: jz r5, #.LBB0_5 +; CHECK-NEXT: jz r5, #.LBB0_6 ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 @@ -37,7 +37,7 @@ define dso_local i32 @dot(ptr nocapture readonly %a, ptr nocapture readonly %b, ; CHECK-NEXT: // %bb.2: // %for.body ; CHECK-NEXT: lda r1, [p0, #0]; nopx ; CHECK-NEXT: lda r4, [p1, #0]; add r5, r5, #-1 -; CHECK-NEXT: jz r5, #.LBB0_4 +; CHECK-NEXT: jz r5, #.LBB0_5 ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 @@ -46,25 +46,29 @@ define dso_local i32 @dot(ptr nocapture readonly %a, ptr nocapture readonly %b, ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_3: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lda r1, [p0, #0]; nopxm -; CHECK-NEXT: nop -; CHECK-NEXT: lda r4, [p1, #0]; add r5, r5, #-1 +; 
CHECK-NEXT: nopa ; nopb ; add r5, r5, #-1; nopm ; CHECK-NEXT: jnz r5, #.LBB0_3 ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: and r6, r3, r2; mov r2, r1 // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: padda [p0], #2044; mov r3, r4 // Delay Slot 2 -; CHECK-NEXT: padda [p1], #2044; or r0, r6, r0 // Delay Slot 1 +; CHECK-NEXT: lda r1, [p0, #0] // Delay Slot 4 +; CHECK-NEXT: padda [p0], #2044 // Delay Slot 3 +; CHECK-NEXT: lda r4, [p1, #0]; and r6, r3, r2; mov r3, r4 // Delay Slot 2 +; CHECK-NEXT: padda [p1], #2044; or r0, r6, r0; mov r2, r1 // Delay Slot 1 +; CHECK-NEXT: // %bb.4: +; CHECK-NEXT: nopa ; nopb ; nopx +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: .LBB0_5: ; CHECK-NEXT: and r2, r3, r2; mov r3, r4 ; CHECK-NEXT: or r0, r2, r0; mov r2, r1 ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB0_5: +; CHECK-NEXT: .LBB0_6: ; CHECK-NEXT: nopa ; nopb ; and r1, r3, r2; nopm ; CHECK-NEXT: or r0, r1, r0 ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB0_6: // %for.cond.cleanup +; CHECK-NEXT: .LBB0_7: // %for.cond.cleanup ; CHECK-NEXT: nopa ; ret lr ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 From f39b0dcf4ff97813ad5a8e92170a294e4c611aaa Mon Sep 17 00:00:00 2001 From: Krishnam Tibrewala Date: Thu, 15 Aug 2024 12:08:23 -0700 Subject: [PATCH 25/31] [AIE2] Offload Vector Loads to SlotB --- llvm/lib/Target/AIE/AIE2InstrInfo.cpp | 2 + llvm/lib/Target/AIE/AIE2InstrInfo.td | 10 +- llvm/lib/Target/AIE/AIE2InstrPatterns.td | 5 +- .../Target/AIE/AIE2InstructionSelector.cpp | 82 +- .../AIE/AIE2MultiSlotPseudoInstrInfo.td | 81 +- llvm/lib/Target/AIE/AIEHazardRecognizer.cpp | 8 + .../GlobalISel/indexed-512-load-store.mir | 75 +- .../inst-select-indexed-load-store.mir | 254 +++- .../inst-select-no-combine-vldb_unpack.mir | 10 +- .../GlobalISel/inst-select-no-combine.mir | 4 +- .../inst-select-postinc-2d-vlda_ups.mir | 6 +- .../inst-select-pre-post-increment.mir 
| 342 +++++- .../GlobalISel/inst-select-vector-load.mir | 78 +- .../aie2/GlobalISel/inst-select-vlda_conv.mir | 10 +- .../inst-select-vlda_ups-unsafe-to-move.mir | 4 +- .../aie2/GlobalISel/inst-select-vldst-mmo.mir | 52 +- .../vld-ups-combine-store-after-load.mir | 4 +- .../vld-ups-combine-use-before-def.mir | 6 +- .../AIE/aie2/end-to-end/Conv2D-red-swp.ll | 96 +- .../CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll | 16 +- .../CodeGen/AIE/aie2/end-to-end/Memops.ll | 2 +- .../test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll | 8 +- llvm/test/CodeGen/AIE/aie2/mmo-load.ll | 8 +- .../AIE/aie2/ra/split-instrs-create.mir | 45 + .../resource/memory_bank_vld_multi_slot.mir | 1083 +++++++++++++++++ llvm/test/CodeGen/AIE/aie2/schedule/vld.mir | 324 +++++ .../CodeGen/AIE/aie2/schedule/vlda_vldb.mir | 19 + llvm/test/CodeGen/AIE/aie2/test-alignas.ll | 2 +- .../tied-physical-regs-match-vld-2d.mir | 9 +- .../tied-physical-regs-match-vld-3d.mir | 5 +- llvm/test/CodeGen/AIE/aie2/vst_srs.ll | 2 +- 31 files changed, 2375 insertions(+), 277 deletions(-) create mode 100644 llvm/test/CodeGen/AIE/aie2/schedule/resource/memory_bank_vld_multi_slot.mir diff --git a/llvm/lib/Target/AIE/AIE2InstrInfo.cpp b/llvm/lib/Target/AIE/AIE2InstrInfo.cpp index 4bc3d07373c6..9021bf019c30 100644 --- a/llvm/lib/Target/AIE/AIE2InstrInfo.cpp +++ b/llvm/lib/Target/AIE/AIE2InstrInfo.cpp @@ -969,6 +969,7 @@ AIE2InstrInfo::getTiedRegInfo(unsigned Opcode) const { SubRegSplit(AIE2::sub_hi_dim_then_sub_dim_count)}; switch (Opcode) { + case AIE2::VLD_2D_pseudo: case AIE2::LDA_2D_dmv_lda_q: case AIE2::LDA_2D_dms_lda: case AIE2::LDA_2D_S8_dmhb_lda: @@ -995,6 +996,7 @@ AIE2InstrInfo::getTiedRegInfo(unsigned Opcode) const { case AIE2::LDA_3D_S16_dmhb_lda: case AIE2::LDA_3D_U8_dmhb_lda: case AIE2::LDA_3D_U16_dmhb_lda: + case AIE2::VLD_3D_pseudo: case AIE2::VLDA_3D_dmw_lda_w: case AIE2::VLDA_3D_dmw_lda_am: case AIE2::VLDA_3D_CONV_FP32_BF16: diff --git a/llvm/lib/Target/AIE/AIE2InstrInfo.td b/llvm/lib/Target/AIE/AIE2InstrInfo.td index 
56898cabed23..65caa19382ef 100644 --- a/llvm/lib/Target/AIE/AIE2InstrInfo.td +++ b/llvm/lib/Target/AIE/AIE2InstrInfo.td @@ -367,6 +367,7 @@ include "AIE2GenInstrInfo.td" include "AIE2CompositeFormats.td" // Manual fixes to the auto-generated files include "AIE2GenFixupInstrInfo.td" +include "AIE2MultiSlotPseudoInstrInfo.td" //Intrinsics let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in { @@ -584,8 +585,8 @@ class Split2DInstr : SplitPseudo; @@ -608,8 +609,8 @@ class Split3DInstr : SplitPseudo; @@ -625,5 +626,4 @@ foreach instr = [VST_3D_SRS_D8_S32, VST_3D_SRS_D16_S64, VST_3D_SRS_D16_S32, PADDA_3D, PADDB_3D, PADDS_3D] in def instr # _split : Split3DInstr; -include "AIE2MultiSlotPseudoInstrInfo.td" include "AIE2InstrPatterns.td" diff --git a/llvm/lib/Target/AIE/AIE2InstrPatterns.td b/llvm/lib/Target/AIE/AIE2InstrPatterns.td index d363f40bab53..7fb1ea6c4cbd 100644 --- a/llvm/lib/Target/AIE/AIE2InstrPatterns.td +++ b/llvm/lib/Target/AIE/AIE2InstrPatterns.td @@ -242,11 +242,8 @@ def : PartWordStPat; } // 3.2 VLD - Vector Load -class VectorLdPat - : Pat<(ResType (load eP:$rs1)), (Inst eP:$rs1, (i20 0))>; - // 1024-bit vector load -class VectorLdPat1024 +class VectorLdPat1024 : Pat<(Rtype (load eP:$rs1)), (INSERT_SUBREG (INSERT_SUBREG diff --git a/llvm/lib/Target/AIE/AIE2InstructionSelector.cpp b/llvm/lib/Target/AIE/AIE2InstructionSelector.cpp index de1886db5e51..62616639c977 100644 --- a/llvm/lib/Target/AIE/AIE2InstructionSelector.cpp +++ b/llvm/lib/Target/AIE/AIE2InstructionSelector.cpp @@ -3265,12 +3265,12 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( unsigned RBID = deriveRegBankID(I.getOperand(0).getReg(), MRI, RBI); if (RBID == AIE2::AccRegBankID) { return {/*ISelOpcode=*/AIE2::VST_dmw_sts_am_ag_idx_imm, - /*FitsImmediateRange=*/true, + AlwaysFitsImmediateRange, /*OffsetOpcode=*/AIE2::VST_dmw_sts_am_ag_idx_imm}; } if (RBID == AIE2::VRegBankID) { return {/*ISelOpcode=*/AIE2::VST_dmw_sts_w_ag_idx_imm, - /*FitsImmediateRange=*/true, + 
AlwaysFitsImmediateRange, /*OffsetOpcode=*/AIE2::VST_dmw_sts_w_ag_idx_imm}; } llvm_unreachable("Vector type not in AccRegBank nor VRegBank"); @@ -3497,13 +3497,13 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( unsigned RBID = deriveRegBankID(I.getOperand(0).getReg(), MRI, RBI); if (RBID == AIE2::AccRegBankID) { return {/*ISelOpcode=*/AIE2::VLDA_dmw_lda_am_ag_idx_imm, - /*FitsImmediateRange=*/true, + AlwaysFitsImmediateRange, /*OffsetOpcode=*/AIE2::VLDA_dmw_lda_am_ag_idx_imm}; } if (RBID == AIE2::VRegBankID) { - return {/*ISelOpcode=*/AIE2::VLDA_dmw_lda_w_ag_idx_imm, - /*FitsImmediateRange=*/true, - /*OffsetOpcode=*/AIE2::VLDA_dmw_lda_w_ag_idx_imm}; + return {/*ISelOpcode=*/AIE2::VLD_idx_imm_3x32_pseudo, + AlwaysFitsImmediateRange, + /*OffsetOpcode=*/AIE2::VLD_idx_imm_3x32_pseudo}; } llvm_unreachable("Vector type not in AccRegBank nor VRegBank"); } @@ -3527,9 +3527,9 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( /*OffsetOpcode=*/AIE2::VLDA_dmw_lda_am_ag_idx_imm}; } if (RBID == AIE2::VRegBankID) { - return {/*ISelOpcode=*/AIE2::VLDA_dmw_lda_w_ag_idx_imm, + return {/*ISelOpcode=*/AIE2::VLD_idx_imm_3x32_pseudo, AlwaysFitsImmediateRange, - /*OffsetOpcode=*/AIE2::VLDA_dmw_lda_w_ag_idx_imm}; + /*OffsetOpcode=*/{}}; } llvm_unreachable("Vector type not in AccRegBank nor VRegBank"); } @@ -3557,10 +3557,30 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( /*OffsetOpcode=*/AIE2::VLDA_dmw_lda_am_ag_idx_imm}; } if (RBID == AIE2::VRegBankID) { - FitsImmediateRange = checkImmediateRangeSplitting<11, 32, 32>(Offset); - return {/*ISelOpcode=*/AIE2::VLDA_dmw_lda_w_ag_idx_imm, - FitsImmediateRange, - /*OffsetOpcode=*/AIE2::VLDA_dmw_lda_w_ag_idx_imm}; + unsigned OffsetOpcode; + // First try if the Instruction can be selected as multi-slot offset + // load + if (checkImmediateRangeSplitting<8, 32, 32>(Offset)) { + FitsImmediateRange = true; + ISelOpcode = OffsetOpcode = AIE2::VLD_idx_imm_3x32_pseudo; + } else if (checkImmediateRange<8, 
32>(Offset)) { + // When Offset is positive and one of the offset is in range of SlotB + ISelOpcode = AIE2::VLD_idx_imm_3x32_pseudo; + OffsetOpcode = AIE2::VLDA_dmw_lda_w_ag_idx_imm; + FitsImmediateRange = true; + } else if (Offset.has_value() && (*Offset).isNegative() && + checkImmediateRange<8, 32>((*Offset) + 32)) { + // When Offset is negative and one of the offset is in range of SlotB + ISelOpcode = AIE2::VLDA_dmw_lda_w_ag_idx_imm; + OffsetOpcode = AIE2::VLD_idx_imm_3x32_pseudo; + FitsImmediateRange = true; + } else { + // When Offset & Offset+32 are out of range of SlotB + FitsImmediateRange = checkImmediateRangeSplitting<11, 32, 32>(Offset); + ISelOpcode = OffsetOpcode = AIE2::VLDA_dmw_lda_w_ag_idx_imm; + } + return {/*ISelOpcode=*/ISelOpcode, FitsImmediateRange, + /*OffsetOpcode=*/OffsetOpcode}; } llvm_unreachable("Vector type not in AccRegBank nor VRegBank"); } @@ -3574,11 +3594,18 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( /*OffsetOpcode=*/AIE2::VLDA_dmw_lda_am_ag_idx_imm}; } if (RBID == AIE2::VRegBankID) { - FitsImmediateRange = checkImmediateRange<11, 32>(Offset); - ISelOpcode = FitsImmediateRange ? AIE2::VLDA_dmw_lda_w_ag_idx_imm - : AIE2::VLDA_dmw_lda_w_ag_idx; + // First try if the Instruction can be selected as multi-slot offset + // load + if (checkImmediateRange<8, 32>(Offset)) { + FitsImmediateRange = true; + ISelOpcode = AIE2::VLD_idx_imm_3x32_pseudo; + } else { + FitsImmediateRange = checkImmediateRange<11, 32>(Offset); + ISelOpcode = FitsImmediateRange ? 
AIE2::VLDA_dmw_lda_w_ag_idx_imm + : AIE2::VLD_idx_pseudo; + } return {ISelOpcode, FitsImmediateRange, - /*OffsetOpcode=*/AIE2::VLDA_dmw_lda_w_ag_idx_imm}; + /*OffsetOpcode=*/{}}; } llvm_unreachable("Vector type not in AccRegBank nor VRegBank"); } @@ -3635,8 +3662,8 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( return {/*ISelOpcode=*/AIE2::VLDA_2D_dmw_lda_am, NoImmediate, /*OffsetOpcode=*/AIE2::VLDA_dmw_lda_am_ag_idx_imm}; if (RBID == AIE2::VRegBankID) - return {/*ISelOpcode=*/AIE2::VLDA_2D_dmw_lda_w, NoImmediate, - /*OffsetOpcode=*/AIE2::VLDA_dmw_lda_w_ag_idx_imm}; + return {/*ISelOpcode=*/AIE2::VLD_2D_pseudo, NoImmediate, + /*OffsetOpcode=*/AIE2::VLD_idx_imm_3x32_pseudo}; llvm_unreachable("Vector type not in AccRegBank nor VRegBank"); } if (getLoadStoreSize(I) == 128) { @@ -3678,8 +3705,8 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( return {/*ISelOpcode=*/AIE2::VLDA_3D_dmw_lda_am, NoImmediate, /*OffsetOpcode=*/AIE2::VLDA_dmw_lda_am_ag_idx_imm}; if (RBID == AIE2::VRegBankID) - return {/*ISelOpcode=*/AIE2::VLDA_3D_dmw_lda_w, NoImmediate, - /*OffsetOpcode=*/AIE2::VLDA_dmw_lda_w_ag_idx_imm}; + return {/*ISelOpcode=*/AIE2::VLD_3D_pseudo, NoImmediate, + /*OffsetOpcode=*/AIE2::VLD_idx_imm_3x32_pseudo}; llvm_unreachable("Vector type not in AccRegBank nor VRegBank"); } if (getLoadStoreSize(I) == 128) { @@ -3725,11 +3752,18 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( /*OffsetOpcode=*/AIE2::VLDA_dmw_lda_am_ag_idx_imm}; } if (RBID == AIE2::VRegBankID) { - FitsImmediateRange = checkImmediateRange<12, 32>(Offset); - ISelOpcode = FitsImmediateRange ? 
AIE2::VLDA_dmw_lda_w_ag_pstm_nrm_imm - : AIE2::VLDA_dmw_lda_w_ag_pstm_nrm; + // First try if the Instruction can be selected as multi-slot offset + // load + if (checkImmediateRange<9, 32>(Offset)) { + FitsImmediateRange = true; + ISelOpcode = AIE2::VLD_pstm_imm_4x32_pseudo; + } else { + FitsImmediateRange = checkImmediateRange<12, 32>(Offset); + ISelOpcode = FitsImmediateRange ? AIE2::VLDA_dmw_lda_w_ag_pstm_nrm_imm + : AIE2::VLD_pstm_pseudo; + } return {ISelOpcode, FitsImmediateRange, - /*OffsetOpcode=*/AIE2::VLDA_dmw_lda_w_ag_idx_imm}; + /*OffsetOpcode=*/AIE2::VLD_idx_imm_3x32_pseudo}; } llvm_unreachable("Vector type not in AccRegBank nor VRegBank"); } diff --git a/llvm/lib/Target/AIE/AIE2MultiSlotPseudoInstrInfo.td b/llvm/lib/Target/AIE/AIE2MultiSlotPseudoInstrInfo.td index 38f15bebacd1..4092e75aac77 100644 --- a/llvm/lib/Target/AIE/AIE2MultiSlotPseudoInstrInfo.td +++ b/llvm/lib/Target/AIE/AIE2MultiSlotPseudoInstrInfo.td @@ -12,17 +12,23 @@ // The last parameter to the MulitSlot_Pseudo is instruction into which the Multi-Slot Pseudo could be materialize to. // Priority to Slot/Instruction is based on the sequence in which the real instructions are passed +// Note : The itineraries of all instructions covered by a multi-slot pseudo should be equivalent. +// We need to make sure the operand latency is same for all the possible instructions. 
+ // Multi-Slot Pseudo PADD let mayLoad = false, mayStore = false, hasSideEffects = false, Itinerary = II_PADD in { let Constraints = "$ptr = $res" in { def PADD_mod_pseudo : MultiSlot_Pseudo< (outs eP:$res), (ins eP:$ptr, eM:$mod), - "padd_mod_pseudo", "[$ptr], $mod", [PADDB_ldb_ptr_inc_nospill_nrm, PADDA_lda_ptr_inc_idx] >; + "padd_mod_pseudo", "[$ptr], $mod", + [PADDB_ldb_ptr_inc_nospill_nrm, PADDA_lda_ptr_inc_idx] >; def PADD_imm_pseudo : MultiSlot_Pseudo< (outs eP:$res), (ins eP:$ptr, imm9x4:$imm), - "padd_imm_pseudo", "[$ptr], $imm", [PADDB_ldb_ptr_inc_nrm_imm, PADDA_lda_ptr_inc_idx_imm] >; + "padd_imm_pseudo", "[$ptr], $imm", + [PADDB_ldb_ptr_inc_nrm_imm, PADDA_lda_ptr_inc_idx_imm] >; } let Defs = [SP], Uses = [SP] in { def PADD_sp_imm_pseudo : MultiSlot_Pseudo< (outs ), (ins imm12x32:$imm), - "padd_sp_imm_pseudo", "[sp], $imm", [PADDB_sp_imm, PADDA_sp_imm] >; + "padd_sp_imm_pseudo", "[sp], $imm", + [PADDB_sp_imm, PADDA_sp_imm] >; } } @@ -31,23 +37,68 @@ let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1, Itinerary = II_ hasSideEffects = false, mayLoad = false, mayStore = false in { // To move 10bit imm to R/lc type reg. - def MOV_RLC_imm10_pseudo : MultiSlot_Pseudo< (outs eRLC:$dst) , (ins simm10:$i), - "mov_rlc_imm10_pseudo ", "$dst, $i", [MOVA_lda_cg, MOVX_alu_cg, MOV_mv_cg, MOVXM_lng_cg]>; + def MOV_RLC_imm10_pseudo : MultiSlot_Pseudo< (outs eRLC:$dst), (ins simm10:$i), + "mov_rlc_imm10_pseudo ", "$dst, $i", + [MOVA_lda_cg, MOVX_alu_cg, MOV_mv_cg, MOVXM_lng_cg]>; // To move 10bit imm to P/D type reg. - def MOV_PD_imm10_pseudo : MultiSlot_Pseudo< (outs ePmDm:$dst) , (ins simm10:$i), - "mov_pd_imm10_pseudo ", "$dst, $i", [MOVA_lda_cg, MOV_mv_cg, MOVXM_lng_cg]>; + def MOV_PD_imm10_pseudo : MultiSlot_Pseudo< (outs ePmDm:$dst), (ins simm10:$i), + "mov_pd_imm10_pseudo ", "$dst, $i", + [MOVA_lda_cg, MOV_mv_cg, MOVXM_lng_cg]>; // To move 10bit imm to S type reg. 
- def MOV_S_imm10_pseudo : MultiSlot_Pseudo< (outs eS:$dst) , (ins simm10:$i), - "mov_s_imm10_pseudo ", "$dst, $i", [MOV_mv_cg, MOVXM_lng_cg]>; + def MOV_S_imm10_pseudo : MultiSlot_Pseudo< (outs eS:$dst), (ins simm10:$i), + "mov_s_imm10_pseudo ", "$dst, $i", + [MOV_mv_cg, MOVXM_lng_cg]>; // To move 10bit imm to Any type reg. (try to use the above first before using the following) - def MOV_scalar_imm10_pseudo : MultiSlot_Pseudo< (outs mMvSclDst:$dst) , (ins simm10:$i), - "mov_scalar_imm10_pseudo ", "$dst, $i", [MOV_mv_cg, MOVXM_lng_cg]>; + def MOV_scalar_imm10_pseudo : MultiSlot_Pseudo< (outs mMvSclDst:$dst), (ins simm10:$i), + "mov_scalar_imm10_pseudo ", "$dst, $i", + [MOV_mv_cg, MOVXM_lng_cg]>; // To move 11bit imm to R/lc type reg. - def MOV_RLC_imm11_pseudo : MultiSlot_Pseudo< (outs eRLC:$dst) , (ins simm11:$i), - "mov_rlc_imm11_pseudo ", "$dst, $i", [MOVA_lda_cg, MOVX_alu_cg, MOVXM_lng_cg]>; + def MOV_RLC_imm11_pseudo : MultiSlot_Pseudo< (outs eRLC:$dst), (ins simm11:$i), + "mov_rlc_imm11_pseudo ", "$dst, $i", + [MOVA_lda_cg, MOVX_alu_cg, MOVXM_lng_cg]>; // To move 11bit imm to P/D type reg. 
- def MOV_PD_imm11_pseudo : MultiSlot_Pseudo< (outs ePmDm:$dst) , (ins simm11:$i), - "mov_PD_imm11_pseudo ", "$dst, $i", [MOVA_lda_cg, MOVXM_lng_cg]>; + def MOV_PD_imm11_pseudo : MultiSlot_Pseudo< (outs ePmDm:$dst), (ins simm11:$i), + "mov_PD_imm11_pseudo ", "$dst, $i", + [MOVA_lda_cg, MOVXM_lng_cg]>; +} + +// Pseudo VLD +let hasSideEffects = false, mayLoad = true, mayStore = false in { + let Itinerary = II_VLDA_W in { + def VLD_idx_pseudo : MultiSlot_Pseudo< (outs mWa:$dst), + (ins eP:$ptr, eDJ:$dj), + "vld_idx_pseudo", "$dst, [$ptr, $dj]", + [VLDB_dmw_ldb_ag_idx, VLDA_dmw_lda_w_ag_idx]>; + def VLD_idx_imm_3x32_pseudo : MultiSlot_Pseudo< (outs mWa:$dst), + (ins eP:$ptr, imm3x32:$imm), + "vld_idx_imm_imm3x32_pseudo", "$dst, [$ptr, $imm]", + [VLDB_dmw_ldb_ag_idx_imm, VLDA_dmw_lda_w_ag_idx_imm]>; + } + let Itinerary = II_VLDA_POSTINC_W in + let Constraints = "$ptr_out = $ptr" in { + def VLD_pstm_pseudo : MultiSlot_Pseudo< (outs mWa:$dst, eP:$ptr_out), + (ins eP:$ptr, eM:$mod), + "vld_pstm_pseudo", "$dst, [$ptr], $mod", + [VLDB_dmw_ldb_ag_pstm_nrm, VLDA_dmw_lda_w_ag_pstm_nrm]>; + def VLD_pstm_imm_4x32_pseudo : MultiSlot_Pseudo< (outs mWa:$dst, eP:$ptr_out), + (ins eP:$ptr, imm4x32:$imm), + "vld_pstm_imm_4x32_pseudo", "$dst, [$ptr], $imm", + [VLDB_dmw_ldb_ag_pstm_nrm_imm, VLDA_dmw_lda_w_ag_pstm_nrm_imm]>; + } + let Itinerary = II_VLDA_2D_W in + let Constraints = "$ptr_out = $ptr" in { + def VLD_2D_pseudo : MultiSlot_Pseudo< (outs mWa:$dst, eP:$ptr_out, eDC:$count_out), + (ins eP:$ptr, eD:$mod), + "vld.2d_pseudo", "$dst, [$ptr], $mod", + [VLDB_2D, VLDA_2D_dmw_lda_w]>; + } + let Itinerary = II_VLDA_3D_W in + let Constraints = "$ptr_out = $ptr" in { + def VLD_3D_pseudo : MultiSlot_Pseudo< (outs mWa:$dst, eP:$ptr_out, eDC:$count_lo_out, eDC:$count_hi_out), + (ins eP:$ptr, eDS:$mod), + "vld.3d_pseudo", "$dst, [$ptr], $mod", + [VLDB_3D, VLDA_3D_dmw_lda_w]>; + } } diff --git a/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp b/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp index 
11dcca57c280..63fee9c4f2c3 100644 --- a/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp +++ b/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp @@ -134,6 +134,10 @@ bool AIEResourceCycle::canReserveResources(MachineInstr &MI) { if (!AlternateInsts) return Bundle.canAdd(&MI); + // Limit VLD multislot instructions to be NFC for the SW pipeliner. + if (MI.mayLoad()) + return Bundle.canAdd(AlternateInsts->back()); + return any_of(*AlternateInsts, [&](unsigned AltOpcode) { return Bundle.canAdd(AltOpcode); }); } @@ -145,6 +149,10 @@ void AIEResourceCycle::reserveResources(MachineInstr &MI) { if (!AlternateInsts) return Bundle.add(&MI); + // Limit VLD multislot instructions to be NFC for the SW pipeliner. + if (MI.mayLoad()) + return Bundle.add(&MI, AlternateInsts->back()); + for (unsigned AltOpcode : *AlternateInsts) { if (Bundle.canAdd(AltOpcode)) { return Bundle.add(&MI, AltOpcode); diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/indexed-512-load-store.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/indexed-512-load-store.mir index a9db927a27d6..9e279d3d38b5 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/indexed-512-load-store.mir +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/indexed-512-load-store.mir @@ -16,13 +16,13 @@ # TODO: This can and should be avoided! 
--- -name: load +name: load_offset_not_32_step legalized: true tracksRegLiveness: true body: | bb.0: liveins: $p0 - ; CHECK-LABEL: name: load + ; CHECK-LABEL: name: load_offset_not_32_step ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 @@ -35,8 +35,8 @@ body: | ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[PADD_mod_pseudo1]], 32 :: (load (<16 x s16>) from unknown-address + 32) ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm1:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[PADD_mod_pseudo1]], 0 :: (load (<16 x s16>), align 64) ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_idx_imm1]], %subreg.sub_256_lo, [[VLDA_dmw_lda_w_ag_idx_imm]], %subreg.sub_256_hi - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx [[PADD_mod_pseudo]], [[COPY1]] :: (load (<16 x s16>)) - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PADD_mod_pseudo]], implicit [[REG_SEQUENCE]], implicit [[VLDA_dmw_lda_w_ag_idx]] + ; CHECK-NEXT: [[VLD_idx_pseudo:%[0-9]+]]:vec256 = VLD_idx_pseudo [[PADD_mod_pseudo]], [[COPY1]] :: (load (<16 x s16>)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PADD_mod_pseudo]], implicit [[REG_SEQUENCE]], implicit [[VLD_idx_pseudo]] %0:_(p0) = COPY $p0 %1:_(s32) = G_CONSTANT i32 24 %2:_(s20) = G_TRUNC %1 @@ -48,13 +48,42 @@ body: | ... 
--- -name: store +name: load_offset_is_32_step +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0 + ; CHECK-LABEL: name: load_offset_is_32_step + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[MOV_RLC_imm10_pseudo:%[0-9]+]]:er = MOV_RLC_imm10_pseudo 64 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:em = COPY [[MOV_RLC_imm10_pseudo]] + ; CHECK-NEXT: [[PADD_mod_pseudo:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 96 :: (load (<16 x s16>) from unknown-address + 32) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo1:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 64 :: (load (<16 x s16>), align 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_idx_imm_3x32_pseudo1]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo2:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[PADD_mod_pseudo]], 64 :: (load (<16 x s16>)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PADD_mod_pseudo]], implicit [[REG_SEQUENCE]], implicit [[VLD_idx_imm_3x32_pseudo2]] + %0:_(p0) = COPY $p0 + %1:_(s32) = G_CONSTANT i32 64 + %2:_(s20) = G_TRUNC %1 + %3:_(p0) = G_PTR_ADD %0, %2 + %4:_(<32 x s16>) = G_LOAD %3(p0) :: (load (<32 x s16>)) + %5:_(p0) = G_PTR_ADD %3, %2 + %6:_(<16 x s16>) = G_LOAD %5(p0) :: (load (<16 x s16>)) + PseudoRET implicit $lr, implicit %3, implicit %4, implicit %6 +... + +--- +name: store_offset_not_32_step legalized: true tracksRegLiveness: true body: | bb.0: liveins: $p0, $x0, $wl2 - ; CHECK-LABEL: name: store + ; CHECK-LABEL: name: store_offset_not_32_step ; CHECK: liveins: $p0, $x0, $wl2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 @@ -83,3 +112,37 @@ body: | G_STORE %2, %6(p0) :: (store (<16 x s16>)) PseudoRET implicit $lr, implicit %5 ... 
+ +--- +name: store_offset_is_32_step +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0, $x0, $wl2 + ; CHECK-LABEL: name: store_offset_is_32_step + ; CHECK: liveins: $p0, $x0, $wl2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec512 = COPY $x0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec256 = COPY $wl2 + ; CHECK-NEXT: [[MOV_RLC_imm10_pseudo:%[0-9]+]]:er = MOV_RLC_imm10_pseudo 64 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:em = COPY [[MOV_RLC_imm10_pseudo]] + ; CHECK-NEXT: [[PADD_mod_pseudo:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY]], [[COPY3]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vec256 = COPY [[COPY1]].sub_256_lo + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vec256 = COPY [[COPY1]].sub_256_hi + ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm [[COPY5]], [[COPY]], 96 :: (store (<16 x s16>) into unknown-address + 32) + ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm [[COPY4]], [[COPY]], 64 :: (store (<16 x s16>), align 64) + ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm [[COPY2]], [[PADD_mod_pseudo]], 64 :: (store (<16 x s16>)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PADD_mod_pseudo]] + %0:_(p0) = COPY $p0 + %1:_(<32 x s16>) = COPY $x0 + %2:_(<16 x s16>) = COPY $wl2 + %3:_(s32) = G_CONSTANT i32 64 + %4:_(s20) = G_TRUNC %3 + %5:_(p0) = G_PTR_ADD %0, %4 + G_STORE %1, %5(p0) :: (store (<32 x s16>)) + %6:_(p0) = G_PTR_ADD %5, %4 + G_STORE %2, %6(p0) :: (store (<16 x s16>)) + PseudoRET implicit $lr, implicit %5 +... 
diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-indexed-load-store.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-indexed-load-store.mir index cee216290ba3..6f5e626abbb7 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-indexed-load-store.mir +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-indexed-load-store.mir @@ -8,14 +8,14 @@ # RUN: llc -mtriple aie2 -run-pass=instruction-select %s -verify-machineinstrs -o - | FileCheck %s --- -name: VEC256_LOAD_maxOffset +name: VEC256_LOAD_maxOffset_slotA alignment: 16 legalized: true regBankSelected: true body: | bb.1.entry: liveins: $p0 - ; CHECK-LABEL: name: VEC256_LOAD_maxOffset + ; CHECK-LABEL: name: VEC256_LOAD_maxOffset_slotA ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 @@ -28,14 +28,14 @@ body: | ... --- -name: VEC256_LOAD_minOffset +name: VEC256_LOAD_minOffset_slotA alignment: 16 legalized: true regBankSelected: true body: | bb.1.entry: liveins: $p0 - ; CHECK-LABEL: name: VEC256_LOAD_minOffset + ; CHECK-LABEL: name: VEC256_LOAD_minOffset_slotA ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 @@ -48,20 +48,60 @@ body: | ... --- -name: VEC256_LOAD_overMaxOffset +name: VEC256_LOAD_maxOffset_slotB alignment: 16 legalized: true regBankSelected: true body: | bb.1.entry: liveins: $p0 - ; CHECK-LABEL: name: VEC256_LOAD_overMaxOffset + ; CHECK-LABEL: name: VEC256_LOAD_maxOffset_slotB + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 96 :: (load (<8 x s32>)) + ; CHECK-NEXT: $wl0 = COPY [[VLD_idx_imm_3x32_pseudo]] + %1:ptrregbank(p0) = COPY $p0 + %2:modregbank(s20) = G_CONSTANT i20 96 + %0:vregbank(<8 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<8 x s32>)) + $wl0 = COPY %0:vregbank(<8 x s32>) +... 
+ +--- +name: VEC256_LOAD_minOffset_slotB +alignment: 16 +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0 + ; CHECK-LABEL: name: VEC256_LOAD_minOffset_slotB + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], -128 :: (load (<8 x s32>)) + ; CHECK-NEXT: $wl0 = COPY [[VLD_idx_imm_3x32_pseudo]] + %1:ptrregbank(p0) = COPY $p0 + %2:modregbank(s20) = G_CONSTANT i20 -128 + %0:vregbank(<8 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<8 x s32>)) + $wl0 = COPY %0:vregbank(<8 x s32>) +... + +--- +name: VEC256_LOAD_overMaxOffset_slotA +alignment: 16 +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0 + ; CHECK-LABEL: name: VEC256_LOAD_overMaxOffset_slotA ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 ; CHECK-NEXT: [[MOVXM_lng_cg:%[0-9]+]]:edj_as_32bit = MOVXM_lng_cg 1024 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx [[COPY]], [[MOVXM_lng_cg]] :: (load (<8 x s32>)) - ; CHECK-NEXT: $wl0 = COPY [[VLDA_dmw_lda_w_ag_idx]] + ; CHECK-NEXT: [[VLD_idx_pseudo:%[0-9]+]]:vec256 = VLD_idx_pseudo [[COPY]], [[MOVXM_lng_cg]] :: (load (<8 x s32>)) + ; CHECK-NEXT: $wl0 = COPY [[VLD_idx_pseudo]] %1:ptrregbank(p0) = COPY $p0 %2:modregbank(s20) = G_CONSTANT i20 1024 %0:vregbank(<8 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<8 x s32>)) @@ -69,20 +109,20 @@ body: | ... 
--- -name: VEC256_LOAD_overMinOffset +name: VEC256_LOAD_belowMinOffset_slotA alignment: 16 legalized: true regBankSelected: true body: | bb.1.entry: liveins: $p0 - ; CHECK-LABEL: name: VEC256_LOAD_overMinOffset + ; CHECK-LABEL: name: VEC256_LOAD_belowMinOffset_slotA ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 ; CHECK-NEXT: [[MOVXM_lng_cg:%[0-9]+]]:edj_as_32bit = MOVXM_lng_cg -1056 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx [[COPY]], [[MOVXM_lng_cg]] :: (load (<8 x s32>)) - ; CHECK-NEXT: $wl0 = COPY [[VLDA_dmw_lda_w_ag_idx]] + ; CHECK-NEXT: [[VLD_idx_pseudo:%[0-9]+]]:vec256 = VLD_idx_pseudo [[COPY]], [[MOVXM_lng_cg]] :: (load (<8 x s32>)) + ; CHECK-NEXT: $wl0 = COPY [[VLD_idx_pseudo]] %1:ptrregbank(p0) = COPY $p0 %2:modregbank(s20) = G_CONSTANT i20 -1056 %0:vregbank(<8 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<8 x s32>)) @@ -90,14 +130,54 @@ body: | ... --- -name: VEC512_LOAD_maxOffset +name: VEC256_LOAD_overMaxOffset_slotB +alignment: 16 +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0 + ; CHECK-LABEL: name: VEC256_LOAD_overMaxOffset_slotB + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 128 :: (load (<8 x s32>)) + ; CHECK-NEXT: $wl0 = COPY [[VLDA_dmw_lda_w_ag_idx_imm]] + %1:ptrregbank(p0) = COPY $p0 + %2:modregbank(s20) = G_CONSTANT i20 128 + %0:vregbank(<8 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<8 x s32>)) + $wl0 = COPY %0:vregbank(<8 x s32>) +... 
+ +--- +name: VEC256_LOAD_belowMinOffset_slotB +alignment: 16 +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0 + ; CHECK-LABEL: name: VEC256_LOAD_belowMinOffset_slotB + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], -160 :: (load (<8 x s32>)) + ; CHECK-NEXT: $wl0 = COPY [[VLDA_dmw_lda_w_ag_idx_imm]] + %1:ptrregbank(p0) = COPY $p0 + %2:modregbank(s20) = G_CONSTANT i20 -160 + %0:vregbank(<8 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<8 x s32>)) + $wl0 = COPY %0:vregbank(<8 x s32>) +... + +--- +name: VEC512_LOAD_maxOffset_slotA alignment: 16 legalized: true regBankSelected: true body: | bb.1.entry: liveins: $p0 - ; CHECK-LABEL: name: VEC512_LOAD_maxOffset + ; CHECK-LABEL: name: VEC512_LOAD_maxOffset_slotA ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 @@ -112,14 +192,14 @@ body: | ... --- -name: VEC512_LOAD_minOffset +name: VEC512_LOAD_minOffset_slotA alignment: 16 legalized: true regBankSelected: true body: | bb.1.entry: liveins: $p0 - ; CHECK-LABEL: name: VEC512_LOAD_minOffset + ; CHECK-LABEL: name: VEC512_LOAD_minOffset_slotA ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 @@ -134,14 +214,14 @@ body: | ... --- -name: VEC512_LOAD_overMaxOffset +name: VEC512_LOAD_overMaxOffset_slotA alignment: 16 legalized: true regBankSelected: true body: | bb.1.entry: liveins: $p0 - ; CHECK-LABEL: name: VEC512_LOAD_overMaxOffset + ; CHECK-LABEL: name: VEC512_LOAD_overMaxOffset_slotA ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 @@ -157,14 +237,14 @@ body: | ... 
--- -name: VEC512_LOAD_overMinOffset +name: VEC512_LOAD_belowMinOffset_slotA alignment: 16 legalized: true regBankSelected: true body: | bb.1.entry: liveins: $p0 - ; CHECK-LABEL: name: VEC512_LOAD_overMinOffset + ; CHECK-LABEL: name: VEC512_LOAD_belowMinOffset_slotA ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 @@ -179,6 +259,116 @@ body: | $x0 = COPY %0:vregbank(<16 x s32>) ... +--- +name: VEC512_LOAD_maxOffset_slotB +alignment: 16 +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0 + ; CHECK-LABEL: name: VEC512_LOAD_maxOffset_slotB + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 128 :: (load (<8 x s32>) from unknown-address + 32) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 96 :: (load (<8 x s32>), align 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_lo, [[VLDA_dmw_lda_w_ag_idx_imm]], %subreg.sub_256_hi + ; CHECK-NEXT: $x0 = COPY [[REG_SEQUENCE]] + %1:ptrregbank(p0) = COPY $p0 + %2:modregbank(s20) = G_CONSTANT i20 96 + %0:vregbank(<16 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<16 x s32>)) + $x0 = COPY %0:vregbank(<16 x s32>) +... 
+ +--- +name: VEC512_LOAD_minOffset_slotB +alignment: 16 +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0 + ; CHECK-LABEL: name: VEC512_LOAD_minOffset_slotB + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], -96 :: (load (<8 x s32>) from unknown-address + 32) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo1:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], -128 :: (load (<8 x s32>), align 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_idx_imm_3x32_pseudo1]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi + ; CHECK-NEXT: $x0 = COPY [[REG_SEQUENCE]] + %1:ptrregbank(p0) = COPY $p0 + %2:modregbank(s20) = G_CONSTANT i20 -128 + %0:vregbank(<16 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<16 x s32>)) + $x0 = COPY %0:vregbank(<16 x s32>) +... + +--- +name: VEC512_LOAD_overMaxOffset_slotB +alignment: 16 +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0 + ; CHECK-LABEL: name: VEC512_LOAD_overMaxOffset_slotB + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 160 :: (load (<8 x s32>) from unknown-address + 32) + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm1:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 128 :: (load (<8 x s32>), align 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_idx_imm1]], %subreg.sub_256_lo, [[VLDA_dmw_lda_w_ag_idx_imm]], %subreg.sub_256_hi + ; CHECK-NEXT: $x0 = COPY [[REG_SEQUENCE]] + %1:ptrregbank(p0) = COPY $p0 + %2:modregbank(s20) = G_CONSTANT i20 128 + %0:vregbank(<16 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<16 x s32>)) + $x0 = COPY %0:vregbank(<16 x s32>) +... 
+ +--- +name: VEC512_LOAD_belowMinOffset_slotB +alignment: 16 +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0 + ; CHECK-LABEL: name: VEC512_LOAD_belowMinOffset_slotB + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], -128 :: (load (<8 x s32>) from unknown-address + 32) + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], -160 :: (load (<8 x s32>), align 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_idx_imm]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi + ; CHECK-NEXT: $x0 = COPY [[REG_SEQUENCE]] + %1:ptrregbank(p0) = COPY $p0 + %2:modregbank(s20) = G_CONSTANT i20 -160 + %0:vregbank(<16 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<16 x s32>)) + $x0 = COPY %0:vregbank(<16 x s32>) +... + +--- +name: VEC512_LOAD_maxOffset-32_slotB +alignment: 16 +legalized: true +regBankSelected: true +body: | + bb.1.entry: + liveins: $p0 + ; CHECK-LABEL: name: VEC512_LOAD_maxOffset-32_slotB + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 96 :: (load (<8 x s32>) from unknown-address + 32) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo1:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 64 :: (load (<8 x s32>), align 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_idx_imm_3x32_pseudo1]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi + ; CHECK-NEXT: $x0 = COPY [[REG_SEQUENCE]] + %1:ptrregbank(p0) = COPY $p0 + %2:modregbank(s20) = G_CONSTANT i20 64 + %0:vregbank(<16 x s32>) = G_AIE_OFFSET_LOAD %1:ptrregbank(p0), %2:modregbank(s20) :: (load (<16 x s32>)) + $x0 = COPY %0:vregbank(<16 x s32>) +... 
+ --- name: ACC256_LOAD_maxOffset alignment: 16 @@ -241,14 +431,14 @@ body: | ... --- -name: ACC256_LOAD_overMinOffset +name: ACC256_LOAD_belowMinOffset alignment: 16 legalized: true regBankSelected: true body: | bb.1.entry: liveins: $p0 - ; CHECK-LABEL: name: ACC256_LOAD_overMinOffset + ; CHECK-LABEL: name: ACC256_LOAD_belowMinOffset ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 @@ -329,14 +519,14 @@ body: | ... --- -name: ACC512_LOAD_overMinOffset +name: ACC512_LOAD_belowMinOffset alignment: 16 legalized: true regBankSelected: true body: | bb.1.entry: liveins: $p0 - ; CHECK-LABEL: name: ACC512_LOAD_overMinOffset + ; CHECK-LABEL: name: ACC512_LOAD_belowMinOffset ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 @@ -413,14 +603,14 @@ body: | ... --- -name: VEC256_STORE_overMinOffset +name: VEC256_STORE_belowMinOffset alignment: 16 legalized: true regBankSelected: true body: | bb.1.entry: liveins: $p0, $wl0 - ; CHECK-LABEL: name: VEC256_STORE_overMinOffset + ; CHECK-LABEL: name: VEC256_STORE_belowMinOffset ; CHECK: liveins: $p0, $wl0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec256 = COPY $wl0 @@ -504,14 +694,14 @@ body: | ... --- -name: VEC512_STORE_overMinOffset +name: VEC512_STORE_belowMinOffset alignment: 16 legalized: true regBankSelected: true body: | bb.1.entry: liveins: $p0, $x0 - ; CHECK-LABEL: name: VEC512_STORE_overMinOffset + ; CHECK-LABEL: name: VEC512_STORE_belowMinOffset ; CHECK: liveins: $p0, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0 @@ -589,14 +779,14 @@ body: | ... 
--- -name: ACC256_STORE_overMinOffset +name: ACC256_STORE_belowMinOffset alignment: 16 legalized: true regBankSelected: true body: | bb.1.entry: liveins: $amll0, $p0 - ; CHECK-LABEL: name: ACC256_STORE_overMinOffset + ; CHECK-LABEL: name: ACC256_STORE_belowMinOffset ; CHECK: liveins: $amll0, $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:acc256 = COPY $amll0 @@ -680,14 +870,14 @@ body: | ... --- -name: ACC512_STORE_overMinOffset +name: ACC512_STORE_belowMinOffset alignment: 16 legalized: true regBankSelected: true body: | bb.1.entry: liveins: $bml0, $p0 - ; CHECK-LABEL: name: ACC512_STORE_overMinOffset + ; CHECK-LABEL: name: ACC512_STORE_belowMinOffset ; CHECK: liveins: $bml0, $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:acc512 = COPY $bml0 diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-no-combine-vldb_unpack.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-no-combine-vldb_unpack.mir index 54e66ecb25a3..fd5b49611cc4 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-no-combine-vldb_unpack.mir +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-no-combine-vldb_unpack.mir @@ -23,10 +23,10 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]] - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm1:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm [[COPY]], [[COPY2]] :: (load (<32 x s8>)) - ; CHECK-NEXT: $m1 = COPY [[VLDA_dmw_lda_w_ag_pstm_nrm1]] + ; CHECK-NEXT: [[VLD_pstm_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_pseudo1:%[0-9]+]]:ep = VLD_pstm_pseudo [[COPY]], [[COPY2]] :: (load (<32 x s8>)) + ; CHECK-NEXT: $m1 = COPY [[VLD_pstm_pseudo1]] ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ep = COPY $m1 - ; CHECK-NEXT: [[VUNPACK_S16_S8_:%[0-9]+]]:vec512 = VUNPACK_S16_S8 [[VLDA_dmw_lda_w_ag_pstm_nrm]] + ; CHECK-NEXT: [[VUNPACK_S16_S8_:%[0-9]+]]:vec512 = VUNPACK_S16_S8 [[VLD_pstm_pseudo]] ; CHECK-NEXT: 
PseudoRET implicit $lr, implicit [[VUNPACK_S16_S8_]], implicit [[COPY3]] %0:ptrregbank(p0) = COPY $p0 %1:gprregbank(s32) = COPY $r0 @@ -56,9 +56,9 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:er = COPY $r0 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:em = COPY [[COPY2]] ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vec256 = COPY $amll0 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm1:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm [[COPY]], [[COPY3]] :: (load (<32 x s8>)) + ; CHECK-NEXT: [[VLD_pstm_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_pseudo1:%[0-9]+]]:ep = VLD_pstm_pseudo [[COPY]], [[COPY3]] :: (load (<32 x s8>)) ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm [[COPY4]], [[COPY1]], 0 :: (store (<32 x s8>)) - ; CHECK-NEXT: [[VUNPACK_S16_S8_:%[0-9]+]]:vec512 = VUNPACK_S16_S8 [[VLDA_dmw_lda_w_ag_pstm_nrm]] + ; CHECK-NEXT: [[VUNPACK_S16_S8_:%[0-9]+]]:vec512 = VUNPACK_S16_S8 [[VLD_pstm_pseudo]] ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VUNPACK_S16_S8_]] %0:ptrregbank(p0) = COPY $p0 %20:ptrregbank(p0) = COPY $p1 diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-no-combine.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-no-combine.mir index ab3597aadc65..381c4e951c85 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-no-combine.mir +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-no-combine.mir @@ -52,8 +52,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 ; CHECK-NEXT: [[PADD_imm_pseudo:%[0-9]+]]:ep = PADD_imm_pseudo [[COPY]], 992 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[PADD_imm_pseudo]], 0 :: (load (<8 x s32>)) - ; CHECK-NEXT: $wl0 = COPY [[VLDA_dmw_lda_w_ag_idx_imm]] + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[PADD_imm_pseudo]], 0 :: (load (<8 x s32>)) + ; CHECK-NEXT: $wl0 = COPY [[VLD_idx_imm_3x32_pseudo]] %1:ptrregbank(p0) = COPY $p0 %2:modregbank(s20) = G_CONSTANT i20 992 %3:ptrregbank(p0) = 
G_PTR_ADD %1:ptrregbank, %2:modregbank(s20) diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-postinc-2d-vlda_ups.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-postinc-2d-vlda_ups.mir index 591ac82bf70c..11d15b0f9291 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-postinc-2d-vlda_ups.mir +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-postinc-2d-vlda_ups.mir @@ -684,11 +684,11 @@ body: | ; CHECK-NEXT: [[MOV_PD_imm10_pseudo2:%[0-9]+]]:edn = MOV_PD_imm10_pseudo 3 ; CHECK-NEXT: [[MOV_PD_imm10_pseudo3:%[0-9]+]]:edc = MOV_PD_imm10_pseudo 4 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[MOV_PD_imm10_pseudo]], %subreg.sub_mod, [[MOV_PD_imm10_pseudo2]], %subreg.sub_dim_size, [[MOV_PD_imm10_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm10_pseudo3]], %subreg.sub_dim_count - ; CHECK-NEXT: [[VLDA_2D_dmw_lda_w:%[0-9]+]]:vec256, [[VLDA_2D_dmw_lda_w1:%[0-9]+]]:ep, [[VLDA_2D_dmw_lda_w2:%[0-9]+]]:edc = VLDA_2D_dmw_lda_w [[COPY]], [[REG_SEQUENCE]] :: (load (<16 x s16>)) + ; CHECK-NEXT: [[VLD_2D_pseudo:%[0-9]+]]:vec256, [[VLD_2D_pseudo1:%[0-9]+]]:ep, [[VLD_2D_pseudo2:%[0-9]+]]:edc = VLD_2D_pseudo [[COPY]], [[REG_SEQUENCE]] :: (load (<16 x s16>)) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:mss = COPY [[COPY1]] - ; CHECK-NEXT: [[VUPS_S32_S16_mv_ups_w2b:%[0-9]+]]:acc512 = VUPS_S32_S16_mv_ups_w2b [[VLDA_2D_dmw_lda_w]], [[COPY2]], implicit-def dead $srups_of, implicit $crsat - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VUPS_S32_S16_mv_ups_w2b]], implicit [[VLDA_2D_dmw_lda_w]] + ; CHECK-NEXT: [[VUPS_S32_S16_mv_ups_w2b:%[0-9]+]]:acc512 = VUPS_S32_S16_mv_ups_w2b [[VLD_2D_pseudo]], [[COPY2]], implicit-def dead $srups_of, implicit $crsat + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VUPS_S32_S16_mv_ups_w2b]], implicit [[VLD_2D_pseudo]] %0:ptrregbank(p0) = COPY $p0 %1:em(s20) = G_CONSTANT i20 1 %2:edj(s20) = G_CONSTANT i20 2 diff --git 
a/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-pre-post-increment.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-pre-post-increment.mir index c6eb9a147cf7..bbe186cf1ca4 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-pre-post-increment.mir +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-pre-post-increment.mir @@ -23,13 +23,13 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]] ; CHECK-NEXT: [[MOV_PD_imm10_pseudo:%[0-9]+]]:em = MOV_PD_imm10_pseudo 16 ; CHECK-NEXT: [[MOVXM_lng_cg:%[0-9]+]]:em_as_32bit = MOVXM_lng_cg 2048 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm1:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm [[COPY]], [[COPY2]] :: (load (<16 x s16>)) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[VLDA_dmw_lda_w_ag_pstm_nrm1]], 0 :: (load (<16 x s16>)) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm2:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm3:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm [[VLDA_dmw_lda_w_ag_pstm_nrm_imm1]], [[MOV_PD_imm10_pseudo]] :: (load (<16 x s16>)) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm2:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm3:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[VLDA_dmw_lda_w_ag_pstm_nrm3]], -2048 :: (load (<16 x s16>)) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm4:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm5:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[VLDA_dmw_lda_w_ag_pstm_nrm_imm3]], 2016 :: (load (<16 x s16>)) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm4:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm5:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm [[VLDA_dmw_lda_w_ag_pstm_nrm_imm5]], [[MOVXM_lng_cg]] :: (load (<16 x s16>)) - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDA_dmw_lda_w_ag_pstm_nrm4]] + ; CHECK-NEXT: [[VLD_pstm_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_pseudo1:%[0-9]+]]:ep = VLD_pstm_pseudo 
[[COPY]], [[COPY2]] :: (load (<16 x s16>)) + ; CHECK-NEXT: [[VLD_pstm_imm_4x32_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_imm_4x32_pseudo1:%[0-9]+]]:ep = VLD_pstm_imm_4x32_pseudo [[VLD_pstm_pseudo1]], 0 :: (load (<16 x s16>)) + ; CHECK-NEXT: [[VLD_pstm_pseudo2:%[0-9]+]]:vec256, [[VLD_pstm_pseudo3:%[0-9]+]]:ep = VLD_pstm_pseudo [[VLD_pstm_imm_4x32_pseudo1]], [[MOV_PD_imm10_pseudo]] :: (load (<16 x s16>)) + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[VLD_pstm_pseudo3]], -2048 :: (load (<16 x s16>)) + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm2:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm3:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[VLDA_dmw_lda_w_ag_pstm_nrm_imm1]], 2016 :: (load (<16 x s16>)) + ; CHECK-NEXT: [[VLD_pstm_pseudo4:%[0-9]+]]:vec256, [[VLD_pstm_pseudo5:%[0-9]+]]:ep = VLD_pstm_pseudo [[VLDA_dmw_lda_w_ag_pstm_nrm_imm3]], [[MOVXM_lng_cg]] :: (load (<16 x s16>)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLD_pstm_pseudo4]] %0:ptrregbank(p0) = COPY $p0 %1:gprregbank(s32) = COPY $r0 %7:modregbank(s20) = G_TRUNC %1 @@ -47,6 +47,278 @@ body: | PseudoRET implicit $lr, implicit %18 ... 
+--- +name: post-inc-vector-256-load-w_overMaxOffset_slotB +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0, $r0, $r1 + ; CHECK-LABEL: name: post-inc-vector-256-load-w_overMaxOffset_slotB + ; CHECK: liveins: $p0, $r0, $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]] + ; CHECK-NEXT: [[VLD_pstm_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_pseudo1:%[0-9]+]]:ep = VLD_pstm_pseudo [[COPY]], [[COPY2]] :: (load (<16 x s16>)) + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[VLD_pstm_pseudo1]], 256 :: (load (<16 x s16>)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDA_dmw_lda_w_ag_pstm_nrm_imm]] + %0:ptrregbank(p0) = COPY $p0 + %1:gprregbank(s32) = COPY $r0 + %7:modregbank(s20) = G_TRUNC %1 + %8:modregbank(s20) = G_CONSTANT i20 256 + %13:vregbank(<16 x s16>), %19:ptrregbank(p0) = G_AIE_POSTINC_LOAD %0, %7 :: (load (<16 x s16>)) + %14:vregbank(<16 x s16>), %20:ptrregbank(p0) = G_AIE_POSTINC_LOAD %19, %8 :: (load (<16 x s16>)) + PseudoRET implicit $lr, implicit %14 +... 
+ +--- +name: post-inc-vector-256-load-w_belowMinOffset_slotB +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0, $r0, $r1 + ; CHECK-LABEL: name: post-inc-vector-256-load-w_belowMinOffset_slotB + ; CHECK: liveins: $p0, $r0, $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]] + ; CHECK-NEXT: [[VLD_pstm_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_pseudo1:%[0-9]+]]:ep = VLD_pstm_pseudo [[COPY]], [[COPY2]] :: (load (<16 x s16>)) + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[VLD_pstm_pseudo1]], -288 :: (load (<16 x s16>)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDA_dmw_lda_w_ag_pstm_nrm_imm]] + %0:ptrregbank(p0) = COPY $p0 + %1:gprregbank(s32) = COPY $r0 + %7:modregbank(s20) = G_TRUNC %1 + %8:modregbank(s20) = G_CONSTANT i20 -288 + %13:vregbank(<16 x s16>), %19:ptrregbank(p0) = G_AIE_POSTINC_LOAD %0, %7 :: (load (<16 x s16>)) + %14:vregbank(<16 x s16>), %20:ptrregbank(p0) = G_AIE_POSTINC_LOAD %19, %8 :: (load (<16 x s16>)) + PseudoRET implicit $lr, implicit %14 +... 
+ +--- +name: post-inc-vector-256-load-w_atMaxOffset_slotB +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0, $r0, $r1 + ; CHECK-LABEL: name: post-inc-vector-256-load-w_atMaxOffset_slotB + ; CHECK: liveins: $p0, $r0, $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]] + ; CHECK-NEXT: [[VLD_pstm_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_pseudo1:%[0-9]+]]:ep = VLD_pstm_pseudo [[COPY]], [[COPY2]] :: (load (<16 x s16>)) + ; CHECK-NEXT: [[VLD_pstm_imm_4x32_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_imm_4x32_pseudo1:%[0-9]+]]:ep = VLD_pstm_imm_4x32_pseudo [[VLD_pstm_pseudo1]], 224 :: (load (<16 x s16>)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLD_pstm_imm_4x32_pseudo]] + %0:ptrregbank(p0) = COPY $p0 + %1:gprregbank(s32) = COPY $r0 + %7:modregbank(s20) = G_TRUNC %1 + %8:modregbank(s20) = G_CONSTANT i20 224 + %13:vregbank(<16 x s16>), %19:ptrregbank(p0) = G_AIE_POSTINC_LOAD %0, %7 :: (load (<16 x s16>)) + %14:vregbank(<16 x s16>), %20:ptrregbank(p0) = G_AIE_POSTINC_LOAD %19, %8 :: (load (<16 x s16>)) + PseudoRET implicit $lr, implicit %14 +... 
+ +--- +name: post-inc-vector-256-load-w_atMinOffset_slotB +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0, $r0, $r1 + ; CHECK-LABEL: name: post-inc-vector-256-load-w_atMinOffset_slotB + ; CHECK: liveins: $p0, $r0, $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]] + ; CHECK-NEXT: [[VLD_pstm_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_pseudo1:%[0-9]+]]:ep = VLD_pstm_pseudo [[COPY]], [[COPY2]] :: (load (<16 x s16>)) + ; CHECK-NEXT: [[VLD_pstm_imm_4x32_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_imm_4x32_pseudo1:%[0-9]+]]:ep = VLD_pstm_imm_4x32_pseudo [[VLD_pstm_pseudo1]], -256 :: (load (<16 x s16>)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLD_pstm_imm_4x32_pseudo]] + %0:ptrregbank(p0) = COPY $p0 + %1:gprregbank(s32) = COPY $r0 + %7:modregbank(s20) = G_TRUNC %1 + %8:modregbank(s20) = G_CONSTANT i20 -256 + %13:vregbank(<16 x s16>), %19:ptrregbank(p0) = G_AIE_POSTINC_LOAD %0, %7 :: (load (<16 x s16>)) + %14:vregbank(<16 x s16>), %20:ptrregbank(p0) = G_AIE_POSTINC_LOAD %19, %8 :: (load (<16 x s16>)) + PseudoRET implicit $lr, implicit %14 +... 
+ +--- +name: post-inc-vector-512-load-w_overMaxOffset_slotB +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0, $r0, $r1 + ; CHECK-LABEL: name: post-inc-vector-512-load-w_overMaxOffset_slotB + ; CHECK: liveins: $p0, $r0, $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]] + ; CHECK-NEXT: [[VLD_pstm_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_pseudo1:%[0-9]+]]:ep = VLD_pstm_pseudo [[COPY]], [[COPY2]] :: (load (<16 x s16>)) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[VLD_pstm_pseudo1]], 32 :: (load (<8 x s32>) from unknown-address + 32) + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[VLD_pstm_pseudo1]], 256 :: (load (<8 x s32>), align 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_pstm_nrm_imm]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE]] + %0:ptrregbank(p0) = COPY $p0 + %1:gprregbank(s32) = COPY $r0 + %7:modregbank(s20) = G_TRUNC %1 + %8:modregbank(s20) = G_CONSTANT i20 256 + %13:vregbank(<16 x s16>), %19:ptrregbank(p0) = G_AIE_POSTINC_LOAD %0, %7 :: (load (<16 x s16>)) + %14:vregbank(<16 x s32>), %20:ptrregbank(p0) = G_AIE_POSTINC_LOAD %19, %8 :: (load (<16 x s32>)) + PseudoRET implicit $lr, implicit %14 +... 
+ +--- +name: post-inc-vector-512-load-w_belowMinOffset_slotB +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0, $r0, $r1 + ; CHECK-LABEL: name: post-inc-vector-512-load-w_belowMinOffset_slotB + ; CHECK: liveins: $p0, $r0, $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]] + ; CHECK-NEXT: [[VLD_pstm_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_pseudo1:%[0-9]+]]:ep = VLD_pstm_pseudo [[COPY]], [[COPY2]] :: (load (<16 x s16>)) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[VLD_pstm_pseudo1]], 32 :: (load (<8 x s32>) from unknown-address + 32) + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[VLD_pstm_pseudo1]], -288 :: (load (<8 x s32>), align 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_pstm_nrm_imm]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE]] + %0:ptrregbank(p0) = COPY $p0 + %1:gprregbank(s32) = COPY $r0 + %7:modregbank(s20) = G_TRUNC %1 + %8:modregbank(s20) = G_CONSTANT i20 -288 + %13:vregbank(<16 x s16>), %19:ptrregbank(p0) = G_AIE_POSTINC_LOAD %0, %7 :: (load (<16 x s16>)) + %14:vregbank(<16 x s32>), %20:ptrregbank(p0) = G_AIE_POSTINC_LOAD %19, %8 :: (load (<16 x s32>)) + PseudoRET implicit $lr, implicit %14 +... 
+ +--- +name: post-inc-vector-512-load-w_atMaxOffset_slotB +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0, $r0, $r1 + ; CHECK-LABEL: name: post-inc-vector-512-load-w_atMaxOffset_slotB + ; CHECK: liveins: $p0, $r0, $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]] + ; CHECK-NEXT: [[VLD_pstm_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_pseudo1:%[0-9]+]]:ep = VLD_pstm_pseudo [[COPY]], [[COPY2]] :: (load (<16 x s16>)) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[VLD_pstm_pseudo1]], 32 :: (load (<8 x s32>) from unknown-address + 32) + ; CHECK-NEXT: [[VLD_pstm_imm_4x32_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_imm_4x32_pseudo1:%[0-9]+]]:ep = VLD_pstm_imm_4x32_pseudo [[VLD_pstm_pseudo1]], 224 :: (load (<8 x s32>), align 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_pstm_imm_4x32_pseudo]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE]] + %0:ptrregbank(p0) = COPY $p0 + %1:gprregbank(s32) = COPY $r0 + %7:modregbank(s20) = G_TRUNC %1 + %8:modregbank(s20) = G_CONSTANT i20 224 + %13:vregbank(<16 x s16>), %19:ptrregbank(p0) = G_AIE_POSTINC_LOAD %0, %7 :: (load (<16 x s16>)) + %14:vregbank(<16 x s32>), %20:ptrregbank(p0) = G_AIE_POSTINC_LOAD %19, %8 :: (load (<16 x s32>)) + PseudoRET implicit $lr, implicit %14 +... 
+ +--- +name: post-inc-vector-512-load-w_atMinOffset_slotB +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0, $r0, $r1 + ; CHECK-LABEL: name: post-inc-vector-512-load-w_atMinOffset_slotB + ; CHECK: liveins: $p0, $r0, $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]] + ; CHECK-NEXT: [[VLD_pstm_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_pseudo1:%[0-9]+]]:ep = VLD_pstm_pseudo [[COPY]], [[COPY2]] :: (load (<16 x s16>)) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[VLD_pstm_pseudo1]], 32 :: (load (<8 x s32>) from unknown-address + 32) + ; CHECK-NEXT: [[VLD_pstm_imm_4x32_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_imm_4x32_pseudo1:%[0-9]+]]:ep = VLD_pstm_imm_4x32_pseudo [[VLD_pstm_pseudo1]], -256 :: (load (<8 x s32>), align 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_pstm_imm_4x32_pseudo]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE]] + %0:ptrregbank(p0) = COPY $p0 + %1:gprregbank(s32) = COPY $r0 + %7:modregbank(s20) = G_TRUNC %1 + %8:modregbank(s20) = G_CONSTANT i20 -256 + %13:vregbank(<16 x s16>), %19:ptrregbank(p0) = G_AIE_POSTINC_LOAD %0, %7 :: (load (<16 x s16>)) + %14:vregbank(<16 x s32>), %20:ptrregbank(p0) = G_AIE_POSTINC_LOAD %19, %8 :: (load (<16 x s32>)) + PseudoRET implicit $lr, implicit %14 +... 
+ +--- +name: post-inc-vector-512-load-w_MaxOffset-32_slotB +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0, $r0, $r1 + ; CHECK-LABEL: name: post-inc-vector-512-load-w_MaxOffset-32_slotB + ; CHECK: liveins: $p0, $r0, $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]] + ; CHECK-NEXT: [[VLD_pstm_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_pseudo1:%[0-9]+]]:ep = VLD_pstm_pseudo [[COPY]], [[COPY2]] :: (load (<16 x s16>)) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[VLD_pstm_pseudo1]], 32 :: (load (<8 x s32>) from unknown-address + 32) + ; CHECK-NEXT: [[VLD_pstm_imm_4x32_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_imm_4x32_pseudo1:%[0-9]+]]:ep = VLD_pstm_imm_4x32_pseudo [[VLD_pstm_pseudo1]], 192 :: (load (<8 x s32>), align 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_pstm_imm_4x32_pseudo]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE]] + %0:ptrregbank(p0) = COPY $p0 + %1:gprregbank(s32) = COPY $r0 + %7:modregbank(s20) = G_TRUNC %1 + %8:modregbank(s20) = G_CONSTANT i20 192 + %13:vregbank(<16 x s16>), %19:ptrregbank(p0) = G_AIE_POSTINC_LOAD %0, %7 :: (load (<16 x s16>)) + %14:vregbank(<16 x s32>), %20:ptrregbank(p0) = G_AIE_POSTINC_LOAD %19, %8 :: (load (<16 x s32>)) + PseudoRET implicit $lr, implicit %14 +... 
+ +--- +name: post-inc-vector-512-load-w_MinOffset-add-32_slotB +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0, $r0, $r1 + ; CHECK-LABEL: name: post-inc-vector-512-load-w_MinOffset-add-32_slotB + ; CHECK: liveins: $p0, $r0, $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:er = COPY $r0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]] + ; CHECK-NEXT: [[VLD_pstm_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_pseudo1:%[0-9]+]]:ep = VLD_pstm_pseudo [[COPY]], [[COPY2]] :: (load (<16 x s16>)) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[VLD_pstm_pseudo1]], 32 :: (load (<8 x s32>) from unknown-address + 32) + ; CHECK-NEXT: [[VLD_pstm_imm_4x32_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_imm_4x32_pseudo1:%[0-9]+]]:ep = VLD_pstm_imm_4x32_pseudo [[VLD_pstm_pseudo1]], -224 :: (load (<8 x s32>), align 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_pstm_imm_4x32_pseudo]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE]] + %0:ptrregbank(p0) = COPY $p0 + %1:gprregbank(s32) = COPY $r0 + %7:modregbank(s20) = G_TRUNC %1 + %8:modregbank(s20) = G_CONSTANT i20 -224 + %13:vregbank(<16 x s16>), %19:ptrregbank(p0) = G_AIE_POSTINC_LOAD %0, %7 :: (load (<16 x s16>)) + %14:vregbank(<16 x s32>), %20:ptrregbank(p0) = G_AIE_POSTINC_LOAD %19, %8 :: (load (<16 x s32>)) + PseudoRET implicit $lr, implicit %14 +... 
+ --- name: post-inc-vector-load-am legalized: true @@ -187,24 +459,24 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]] ; CHECK-NEXT: [[MOV_PD_imm10_pseudo:%[0-9]+]]:em = MOV_PD_imm10_pseudo 16 ; CHECK-NEXT: [[MOVXM_lng_cg:%[0-9]+]]:em_as_32bit = MOVXM_lng_cg 2048 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 32 :: (load (<8 x s32>) from stack - 32, basealign 64) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm1:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm [[COPY]], [[COPY2]] :: (load (<8 x s32>) from stack - 64, align 64) - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_pstm_nrm]], %subreg.sub_256_lo, [[VLDA_dmw_lda_w_ag_idx_imm]], %subreg.sub_256_hi - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm1:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[VLDA_dmw_lda_w_ag_pstm_nrm1]], 32 :: (load (<8 x s32>) from stack - 32, basealign 64) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[VLDA_dmw_lda_w_ag_pstm_nrm1]], 0 :: (load (<8 x s32>) from stack - 64, align 64) - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_pstm_nrm_imm]], %subreg.sub_256_lo, [[VLDA_dmw_lda_w_ag_idx_imm1]], %subreg.sub_256_hi - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm2:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[VLDA_dmw_lda_w_ag_pstm_nrm_imm1]], 32 :: (load (<8 x s32>) from stack - 32, basealign 64) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm2:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm3:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm [[VLDA_dmw_lda_w_ag_pstm_nrm_imm1]], [[MOV_PD_imm10_pseudo]] :: (load (<8 x s32>) from stack - 64, align 64) - ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_pstm_nrm2]], %subreg.sub_256_lo, [[VLDA_dmw_lda_w_ag_idx_imm2]], %subreg.sub_256_hi - ; CHECK-NEXT: 
[[VLDA_dmw_lda_w_ag_idx_imm3:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[VLDA_dmw_lda_w_ag_pstm_nrm3]], 32 :: (load (<8 x s32>) from stack - 32, basealign 64) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm2:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm3:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[VLDA_dmw_lda_w_ag_pstm_nrm3]], -2048 :: (load (<8 x s32>) from stack - 64, align 64) - ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_pstm_nrm_imm2]], %subreg.sub_256_lo, [[VLDA_dmw_lda_w_ag_idx_imm3]], %subreg.sub_256_hi - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm4:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[VLDA_dmw_lda_w_ag_pstm_nrm_imm3]], 32 :: (load (<8 x s32>) from stack - 32, basealign 64) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm4:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm5:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[VLDA_dmw_lda_w_ag_pstm_nrm_imm3]], 2016 :: (load (<8 x s32>) from stack - 64, align 64) - ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_pstm_nrm_imm4]], %subreg.sub_256_lo, [[VLDA_dmw_lda_w_ag_idx_imm4]], %subreg.sub_256_hi - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm5:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[VLDA_dmw_lda_w_ag_pstm_nrm_imm5]], 32 :: (load (<8 x s32>) from stack - 32, basealign 64) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm4:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm5:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm [[VLDA_dmw_lda_w_ag_pstm_nrm_imm5]], [[MOVXM_lng_cg]] :: (load (<8 x s32>) from stack - 64, align 64) - ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_pstm_nrm4]], %subreg.sub_256_lo, [[VLDA_dmw_lda_w_ag_idx_imm5]], %subreg.sub_256_hi + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 32 :: (load (<8 x s32>) from stack - 32, basealign 64) + ; CHECK-NEXT: [[VLD_pstm_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_pseudo1:%[0-9]+]]:ep = 
VLD_pstm_pseudo [[COPY]], [[COPY2]] :: (load (<8 x s32>) from stack - 64, align 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_pstm_pseudo]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo1:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[VLD_pstm_pseudo1]], 32 :: (load (<8 x s32>) from stack - 32, basealign 64) + ; CHECK-NEXT: [[VLD_pstm_imm_4x32_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_imm_4x32_pseudo1:%[0-9]+]]:ep = VLD_pstm_imm_4x32_pseudo [[VLD_pstm_pseudo1]], 0 :: (load (<8 x s32>) from stack - 64, align 64) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_pstm_imm_4x32_pseudo]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo1]], %subreg.sub_256_hi + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo2:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[VLD_pstm_imm_4x32_pseudo1]], 32 :: (load (<8 x s32>) from stack - 32, basealign 64) + ; CHECK-NEXT: [[VLD_pstm_pseudo2:%[0-9]+]]:vec256, [[VLD_pstm_pseudo3:%[0-9]+]]:ep = VLD_pstm_pseudo [[VLD_pstm_imm_4x32_pseudo1]], [[MOV_PD_imm10_pseudo]] :: (load (<8 x s32>) from stack - 64, align 64) + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_pstm_pseudo2]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo2]], %subreg.sub_256_hi + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo3:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[VLD_pstm_pseudo3]], 32 :: (load (<8 x s32>) from stack - 32, basealign 64) + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm1:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[VLD_pstm_pseudo3]], -2048 :: (load (<8 x s32>) from stack - 64, align 64) + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_pstm_nrm_imm]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo3]], %subreg.sub_256_hi + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo4:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[VLDA_dmw_lda_w_ag_pstm_nrm_imm1]], 32 :: (load (<8 x s32>) 
from stack - 32, basealign 64) + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm_imm2:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm_imm3:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm_imm [[VLDA_dmw_lda_w_ag_pstm_nrm_imm1]], 2016 :: (load (<8 x s32>) from stack - 64, align 64) + ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_pstm_nrm_imm2]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo4]], %subreg.sub_256_hi + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo5:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[VLDA_dmw_lda_w_ag_pstm_nrm_imm3]], 32 :: (load (<8 x s32>) from stack - 32, basealign 64) + ; CHECK-NEXT: [[VLD_pstm_pseudo4:%[0-9]+]]:vec256, [[VLD_pstm_pseudo5:%[0-9]+]]:ep = VLD_pstm_pseudo [[VLDA_dmw_lda_w_ag_pstm_nrm_imm3]], [[MOVXM_lng_cg]] :: (load (<8 x s32>) from stack - 64, align 64) + ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_pstm_pseudo4]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo5]], %subreg.sub_256_hi ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE5]] %0:ptrregbank(p0) = COPY $p0 %1:gprregbank(s32) = COPY $r0 @@ -240,8 +512,8 @@ body: | ; CHECK-NEXT: [[MOV_PD_imm10_pseudo2:%[0-9]+]]:edn = MOV_PD_imm10_pseudo 3 ; CHECK-NEXT: [[MOV_PD_imm10_pseudo3:%[0-9]+]]:edc = MOV_PD_imm10_pseudo 4 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[MOV_PD_imm10_pseudo]], %subreg.sub_mod, [[MOV_PD_imm10_pseudo2]], %subreg.sub_dim_size, [[MOV_PD_imm10_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm10_pseudo3]], %subreg.sub_dim_count - ; CHECK-NEXT: [[VLDA_2D_dmw_lda_w:%[0-9]+]]:vec256, [[VLDA_2D_dmw_lda_w1:%[0-9]+]]:ep, [[VLDA_2D_dmw_lda_w2:%[0-9]+]]:edc = VLDA_2D_dmw_lda_w [[COPY]], [[REG_SEQUENCE]] :: (load (<8 x s32>)) - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDA_2D_dmw_lda_w]] + ; CHECK-NEXT: [[VLD_2D_pseudo:%[0-9]+]]:vec256, [[VLD_2D_pseudo1:%[0-9]+]]:ep, [[VLD_2D_pseudo2:%[0-9]+]]:edc = VLD_2D_pseudo [[COPY]], [[REG_SEQUENCE]] :: (load (<8 x s32>)) + ; CHECK-NEXT: PseudoRET implicit 
$lr, implicit [[VLD_2D_pseudo]] %0:ptrregbank(p0) = COPY $p0 %1:em(s20) = G_CONSTANT i20 1 %2:edj(s20) = G_CONSTANT i20 2 @@ -268,9 +540,9 @@ body: | ; CHECK-NEXT: [[MOV_PD_imm10_pseudo2:%[0-9]+]]:edn = MOV_PD_imm10_pseudo 3 ; CHECK-NEXT: [[MOV_PD_imm10_pseudo3:%[0-9]+]]:edc = MOV_PD_imm10_pseudo 4 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ed = REG_SEQUENCE [[MOV_PD_imm10_pseudo]], %subreg.sub_mod, [[MOV_PD_imm10_pseudo2]], %subreg.sub_dim_size, [[MOV_PD_imm10_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm10_pseudo3]], %subreg.sub_dim_count - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 32 :: (load (<8 x s32>) from unknown-address + 32) - ; CHECK-NEXT: [[VLDA_2D_dmw_lda_w:%[0-9]+]]:vec256, [[VLDA_2D_dmw_lda_w1:%[0-9]+]]:ep, [[VLDA_2D_dmw_lda_w2:%[0-9]+]]:edc = VLDA_2D_dmw_lda_w [[COPY]], [[REG_SEQUENCE]] :: (load (<8 x s32>), align 64) - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_2D_dmw_lda_w]], %subreg.sub_256_lo, [[VLDA_dmw_lda_w_ag_idx_imm]], %subreg.sub_256_hi + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 32 :: (load (<8 x s32>) from unknown-address + 32) + ; CHECK-NEXT: [[VLD_2D_pseudo:%[0-9]+]]:vec256, [[VLD_2D_pseudo1:%[0-9]+]]:ep, [[VLD_2D_pseudo2:%[0-9]+]]:edc = VLD_2D_pseudo [[COPY]], [[REG_SEQUENCE]] :: (load (<8 x s32>), align 64) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_2D_pseudo]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE1]] %0:ptrregbank(p0) = COPY $p0 %1:em(s20) = G_CONSTANT i20 1 @@ -657,8 +929,8 @@ body: | ; CHECK-NEXT: [[MOV_PD_imm10_pseudo5:%[0-9]+]]:edc = MOV_PD_imm10_pseudo 6 ; CHECK-NEXT: [[MOV_PD_imm10_pseudo6:%[0-9]+]]:edc = MOV_PD_imm10_pseudo 7 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[MOV_PD_imm10_pseudo]], %subreg.sub_mod, [[MOV_PD_imm10_pseudo3]], %subreg.sub_dim_size, 
[[MOV_PD_imm10_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm10_pseudo5]], %subreg.sub_dim_count, [[MOV_PD_imm10_pseudo4]], %subreg.sub_hi_dim_then_sub_dim_size, [[MOV_PD_imm10_pseudo2]], %subreg.sub_hi_dim_then_sub_dim_stride, [[MOV_PD_imm10_pseudo6]], %subreg.sub_hi_dim_then_sub_dim_count - ; CHECK-NEXT: [[VLDA_3D_dmw_lda_w:%[0-9]+]]:vec256, [[VLDA_3D_dmw_lda_w1:%[0-9]+]]:ep, [[VLDA_3D_dmw_lda_w2:%[0-9]+]]:edc, [[VLDA_3D_dmw_lda_w3:%[0-9]+]]:edc = VLDA_3D_dmw_lda_w [[COPY]], [[REG_SEQUENCE]] :: (load (<8 x s32>)) - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLDA_3D_dmw_lda_w]] + ; CHECK-NEXT: [[VLD_3D_pseudo:%[0-9]+]]:vec256, [[VLD_3D_pseudo1:%[0-9]+]]:ep, [[VLD_3D_pseudo2:%[0-9]+]]:edc, [[VLD_3D_pseudo3:%[0-9]+]]:edc = VLD_3D_pseudo [[COPY]], [[REG_SEQUENCE]] :: (load (<8 x s32>)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VLD_3D_pseudo]] %0:ptrregbank(p0) = COPY $p0 %1:em(s20) = G_CONSTANT i20 1 %2:edj(s20) = G_CONSTANT i20 2 @@ -691,9 +963,9 @@ body: | ; CHECK-NEXT: [[MOV_PD_imm10_pseudo5:%[0-9]+]]:edc = MOV_PD_imm10_pseudo 6 ; CHECK-NEXT: [[MOV_PD_imm10_pseudo6:%[0-9]+]]:edc = MOV_PD_imm10_pseudo 7 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:eds = REG_SEQUENCE [[MOV_PD_imm10_pseudo]], %subreg.sub_mod, [[MOV_PD_imm10_pseudo3]], %subreg.sub_dim_size, [[MOV_PD_imm10_pseudo1]], %subreg.sub_dim_stride, [[MOV_PD_imm10_pseudo5]], %subreg.sub_dim_count, [[MOV_PD_imm10_pseudo4]], %subreg.sub_hi_dim_then_sub_dim_size, [[MOV_PD_imm10_pseudo2]], %subreg.sub_hi_dim_then_sub_dim_stride, [[MOV_PD_imm10_pseudo6]], %subreg.sub_hi_dim_then_sub_dim_count - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 32 :: (load (<8 x s32>) from unknown-address + 32) - ; CHECK-NEXT: [[VLDA_3D_dmw_lda_w:%[0-9]+]]:vec256, [[VLDA_3D_dmw_lda_w1:%[0-9]+]]:ep, [[VLDA_3D_dmw_lda_w2:%[0-9]+]]:edc, [[VLDA_3D_dmw_lda_w3:%[0-9]+]]:edc = VLDA_3D_dmw_lda_w [[COPY]], [[REG_SEQUENCE]] :: (load (<8 x s32>), align 64) - ; CHECK-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_3D_dmw_lda_w]], %subreg.sub_256_lo, [[VLDA_dmw_lda_w_ag_idx_imm]], %subreg.sub_256_hi + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 32 :: (load (<8 x s32>) from unknown-address + 32) + ; CHECK-NEXT: [[VLD_3D_pseudo:%[0-9]+]]:vec256, [[VLD_3D_pseudo1:%[0-9]+]]:ep, [[VLD_3D_pseudo2:%[0-9]+]]:edc, [[VLD_3D_pseudo3:%[0-9]+]]:edc = VLD_3D_pseudo [[COPY]], [[REG_SEQUENCE]] :: (load (<8 x s32>), align 64) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_3D_pseudo]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE1]] %0:ptrregbank(p0) = COPY $p0 %1:em(s20) = G_CONSTANT i20 1 diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-vector-load.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-vector-load.mir index 66253d680e77..324afe8df180 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-vector-load.mir +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-vector-load.mir @@ -18,8 +18,8 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 0 :: (load (<32 x s8>)) - ; CHECK-NEXT: $wl0 = COPY [[VLDA_dmw_lda_w_ag_idx_imm]] + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 0 :: (load (<32 x s8>)) + ; CHECK-NEXT: $wl0 = COPY [[VLD_idx_imm_3x32_pseudo]] %1:ptrregbank(p0) = COPY $p0 %0:vregbank(<32 x s8>) = G_LOAD %1(p0) :: (load (<32 x s8>)) $wl0 = COPY %0(<32 x s8>) @@ -36,8 +36,8 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 0 :: (load (<16 x s16>)) - ; CHECK-NEXT: $wl0 = COPY [[VLDA_dmw_lda_w_ag_idx_imm]] + ; 
CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 0 :: (load (<16 x s16>)) + ; CHECK-NEXT: $wl0 = COPY [[VLD_idx_imm_3x32_pseudo]] %1:ptrregbank(p0) = COPY $p0 %0:vregbank(<16 x s16>) = G_LOAD %1(p0) :: (load (<16 x s16>)) $wl0 = COPY %0(<16 x s16>) @@ -54,8 +54,8 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 0 :: (load (<8 x s32>)) - ; CHECK-NEXT: $wl0 = COPY [[VLDA_dmw_lda_w_ag_idx_imm]] + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 0 :: (load (<8 x s32>)) + ; CHECK-NEXT: $wl0 = COPY [[VLD_idx_imm_3x32_pseudo]] %1:ptrregbank(p0) = COPY $p0 %0:vregbank(<8 x s32>) = G_LOAD %1(p0) :: (load (<8 x s32>)) $wl0 = COPY %0(<8 x s32>) @@ -108,9 +108,9 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 32 :: (load (<32 x s8>) from unknown-address + 32) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm1:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 0 :: (load (<32 x s8>), align 64) - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_idx_imm1]], %subreg.sub_256_lo, [[VLDA_dmw_lda_w_ag_idx_imm]], %subreg.sub_256_hi + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 32 :: (load (<32 x s8>) from unknown-address + 32) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo1:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 0 :: (load (<32 x s8>), align 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_idx_imm_3x32_pseudo1]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi ; CHECK-NEXT: $x0 = COPY [[REG_SEQUENCE]] %1:ptrregbank(p0) = COPY $p0 %0:vregbank(<64 x s8>) = G_LOAD %1(p0) :: (load 
(<64 x s8>)) @@ -128,9 +128,9 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 32 :: (load (<16 x s16>) from unknown-address + 32) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm1:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 0 :: (load (<16 x s16>), align 64) - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_idx_imm1]], %subreg.sub_256_lo, [[VLDA_dmw_lda_w_ag_idx_imm]], %subreg.sub_256_hi + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 32 :: (load (<16 x s16>) from unknown-address + 32) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo1:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 0 :: (load (<16 x s16>), align 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_idx_imm_3x32_pseudo1]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi ; CHECK-NEXT: $x0 = COPY [[REG_SEQUENCE]] %1:ptrregbank(p0) = COPY $p0 %0:vregbank(<32 x s16>) = G_LOAD %1(p0) :: (load (<32 x s16>)) @@ -148,9 +148,9 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 32 :: (load (<8 x s32>) from unknown-address + 32) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm1:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 0 :: (load (<8 x s32>), align 64) - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_idx_imm1]], %subreg.sub_256_lo, [[VLDA_dmw_lda_w_ag_idx_imm]], %subreg.sub_256_hi + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 32 :: (load (<8 x s32>) from unknown-address + 32) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo1:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 0 :: (load (<8 x s32>), align 64) + ; 
CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_idx_imm_3x32_pseudo1]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi ; CHECK-NEXT: $x0 = COPY [[REG_SEQUENCE]] %1:ptrregbank(p0) = COPY $p0 %0:vregbank(<16 x s32>) = G_LOAD %1(p0) :: (load (<16 x s32>)) @@ -208,16 +208,16 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:mwa = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 96 :: (load (<128 x s8>)) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm1:%[0-9]+]]:mwa = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 64 :: (load (<128 x s8>)) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:mwa = VLD_idx_imm_3x32_pseudo [[COPY]], 96 :: (load (<128 x s8>)) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo1:%[0-9]+]]:mwa = VLD_idx_imm_3x32_pseudo [[COPY]], 64 :: (load (<128 x s8>)) ; CHECK-NEXT: [[DEF:%[0-9]+]]:mshfldst = IMPLICIT_DEF - ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:mshfldst = INSERT_SUBREG [[DEF]], [[VLDA_dmw_lda_w_ag_idx_imm1]], %subreg.sub_256_lo - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:mshfldst = INSERT_SUBREG [[INSERT_SUBREG]], [[VLDA_dmw_lda_w_ag_idx_imm]], %subreg.sub_256_hi - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm2:%[0-9]+]]:mwa = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 32 :: (load (<128 x s8>)) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm3:%[0-9]+]]:mwa = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 0 :: (load (<128 x s8>)) + ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:mshfldst = INSERT_SUBREG [[DEF]], [[VLD_idx_imm_3x32_pseudo1]], %subreg.sub_256_lo + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:mshfldst = INSERT_SUBREG [[INSERT_SUBREG]], [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo2:%[0-9]+]]:mwa = VLD_idx_imm_3x32_pseudo [[COPY]], 32 :: (load (<128 x s8>)) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo3:%[0-9]+]]:mwa = VLD_idx_imm_3x32_pseudo [[COPY]], 0 :: (load (<128 x s8>)) ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mshfldst = 
IMPLICIT_DEF - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:mshfldst = INSERT_SUBREG [[DEF1]], [[VLDA_dmw_lda_w_ag_idx_imm3]], %subreg.sub_256_lo - ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:mshfldst = INSERT_SUBREG [[INSERT_SUBREG2]], [[VLDA_dmw_lda_w_ag_idx_imm2]], %subreg.sub_256_hi + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:mshfldst = INSERT_SUBREG [[DEF1]], [[VLD_idx_imm_3x32_pseudo3]], %subreg.sub_256_lo + ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:mshfldst = INSERT_SUBREG [[INSERT_SUBREG2]], [[VLD_idx_imm_3x32_pseudo2]], %subreg.sub_256_hi ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vec1024 = IMPLICIT_DEF ; CHECK-NEXT: [[INSERT_SUBREG4:%[0-9]+]]:vec1024 = INSERT_SUBREG [[DEF2]], [[INSERT_SUBREG3]], %subreg.sub_512_lo ; CHECK-NEXT: [[INSERT_SUBREG5:%[0-9]+]]:vec1024 = INSERT_SUBREG [[INSERT_SUBREG4]], [[INSERT_SUBREG1]], %subreg.sub_512_hi @@ -238,16 +238,16 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:mwa = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 96 :: (load (<64 x s16>)) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm1:%[0-9]+]]:mwa = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 64 :: (load (<64 x s16>)) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:mwa = VLD_idx_imm_3x32_pseudo [[COPY]], 96 :: (load (<64 x s16>)) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo1:%[0-9]+]]:mwa = VLD_idx_imm_3x32_pseudo [[COPY]], 64 :: (load (<64 x s16>)) ; CHECK-NEXT: [[DEF:%[0-9]+]]:mshfldst = IMPLICIT_DEF - ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:mshfldst = INSERT_SUBREG [[DEF]], [[VLDA_dmw_lda_w_ag_idx_imm1]], %subreg.sub_256_lo - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:mshfldst = INSERT_SUBREG [[INSERT_SUBREG]], [[VLDA_dmw_lda_w_ag_idx_imm]], %subreg.sub_256_hi - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm2:%[0-9]+]]:mwa = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 32 :: (load (<64 x s16>)) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm3:%[0-9]+]]:mwa = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 0 :: (load (<64 x s16>)) 
+ ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:mshfldst = INSERT_SUBREG [[DEF]], [[VLD_idx_imm_3x32_pseudo1]], %subreg.sub_256_lo + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:mshfldst = INSERT_SUBREG [[INSERT_SUBREG]], [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo2:%[0-9]+]]:mwa = VLD_idx_imm_3x32_pseudo [[COPY]], 32 :: (load (<64 x s16>)) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo3:%[0-9]+]]:mwa = VLD_idx_imm_3x32_pseudo [[COPY]], 0 :: (load (<64 x s16>)) ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mshfldst = IMPLICIT_DEF - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:mshfldst = INSERT_SUBREG [[DEF1]], [[VLDA_dmw_lda_w_ag_idx_imm3]], %subreg.sub_256_lo - ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:mshfldst = INSERT_SUBREG [[INSERT_SUBREG2]], [[VLDA_dmw_lda_w_ag_idx_imm2]], %subreg.sub_256_hi + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:mshfldst = INSERT_SUBREG [[DEF1]], [[VLD_idx_imm_3x32_pseudo3]], %subreg.sub_256_lo + ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:mshfldst = INSERT_SUBREG [[INSERT_SUBREG2]], [[VLD_idx_imm_3x32_pseudo2]], %subreg.sub_256_hi ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vec1024 = IMPLICIT_DEF ; CHECK-NEXT: [[INSERT_SUBREG4:%[0-9]+]]:vec1024 = INSERT_SUBREG [[DEF2]], [[INSERT_SUBREG3]], %subreg.sub_512_lo ; CHECK-NEXT: [[INSERT_SUBREG5:%[0-9]+]]:vec1024 = INSERT_SUBREG [[INSERT_SUBREG4]], [[INSERT_SUBREG1]], %subreg.sub_512_hi @@ -268,16 +268,16 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:mwa = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 96 :: (load (<32 x s32>)) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm1:%[0-9]+]]:mwa = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 64 :: (load (<32 x s32>)) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:mwa = VLD_idx_imm_3x32_pseudo [[COPY]], 96 :: (load (<32 x s32>)) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo1:%[0-9]+]]:mwa = VLD_idx_imm_3x32_pseudo [[COPY]], 64 :: (load (<32 x s32>)) ; CHECK-NEXT: 
[[DEF:%[0-9]+]]:mshfldst = IMPLICIT_DEF - ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:mshfldst = INSERT_SUBREG [[DEF]], [[VLDA_dmw_lda_w_ag_idx_imm1]], %subreg.sub_256_lo - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:mshfldst = INSERT_SUBREG [[INSERT_SUBREG]], [[VLDA_dmw_lda_w_ag_idx_imm]], %subreg.sub_256_hi - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm2:%[0-9]+]]:mwa = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 32 :: (load (<32 x s32>)) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm3:%[0-9]+]]:mwa = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 0 :: (load (<32 x s32>)) + ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:mshfldst = INSERT_SUBREG [[DEF]], [[VLD_idx_imm_3x32_pseudo1]], %subreg.sub_256_lo + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:mshfldst = INSERT_SUBREG [[INSERT_SUBREG]], [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo2:%[0-9]+]]:mwa = VLD_idx_imm_3x32_pseudo [[COPY]], 32 :: (load (<32 x s32>)) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo3:%[0-9]+]]:mwa = VLD_idx_imm_3x32_pseudo [[COPY]], 0 :: (load (<32 x s32>)) ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mshfldst = IMPLICIT_DEF - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:mshfldst = INSERT_SUBREG [[DEF1]], [[VLDA_dmw_lda_w_ag_idx_imm3]], %subreg.sub_256_lo - ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:mshfldst = INSERT_SUBREG [[INSERT_SUBREG2]], [[VLDA_dmw_lda_w_ag_idx_imm2]], %subreg.sub_256_hi + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:mshfldst = INSERT_SUBREG [[DEF1]], [[VLD_idx_imm_3x32_pseudo3]], %subreg.sub_256_lo + ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:mshfldst = INSERT_SUBREG [[INSERT_SUBREG2]], [[VLD_idx_imm_3x32_pseudo2]], %subreg.sub_256_hi ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vec1024 = IMPLICIT_DEF ; CHECK-NEXT: [[INSERT_SUBREG4:%[0-9]+]]:vec1024 = INSERT_SUBREG [[DEF2]], [[INSERT_SUBREG3]], %subreg.sub_512_lo ; CHECK-NEXT: [[INSERT_SUBREG5:%[0-9]+]]:vec1024 = INSERT_SUBREG [[INSERT_SUBREG4]], [[INSERT_SUBREG1]], %subreg.sub_512_hi diff --git 
a/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-vlda_conv.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-vlda_conv.mir index a62f1aed4ca2..cd45525a6057 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-vlda_conv.mir +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-vlda_conv.mir @@ -200,9 +200,9 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY $m0 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec256 = COPY $amll0 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm1:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm [[COPY]], [[COPY2]] :: (load (<16 x s16>)) + ; CHECK-NEXT: [[VLD_pstm_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_pseudo1:%[0-9]+]]:ep = VLD_pstm_pseudo [[COPY]], [[COPY2]] :: (load (<16 x s16>)) ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm [[COPY3]], [[COPY1]], 0 :: (store (<16 x s16>)) - ; CHECK-NEXT: [[VCONV_FP32_BF16_:%[0-9]+]]:acc512 = VCONV_FP32_BF16 [[VLDA_dmw_lda_w_ag_pstm_nrm]] + ; CHECK-NEXT: [[VCONV_FP32_BF16_:%[0-9]+]]:acc512 = VCONV_FP32_BF16 [[VLD_pstm_pseudo]] ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VCONV_FP32_BF16_]] %0:ptrregbank(p0) = COPY $p0 %20:ptrregbank(p0) = COPY $p1 @@ -229,9 +229,9 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:em = COPY $m0 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm1:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm [[COPY]], [[COPY1]] :: (load (<16 x s16>)) - ; CHECK-NEXT: $m1 = COPY [[VLDA_dmw_lda_w_ag_pstm_nrm1]] - ; CHECK-NEXT: [[VCONV_FP32_BF16_:%[0-9]+]]:acc512 = VCONV_FP32_BF16 [[VLDA_dmw_lda_w_ag_pstm_nrm]] + ; CHECK-NEXT: [[VLD_pstm_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_pseudo1:%[0-9]+]]:ep = VLD_pstm_pseudo [[COPY]], [[COPY1]] :: (load (<16 x s16>)) + ; CHECK-NEXT: $m1 = COPY [[VLD_pstm_pseudo1]] + ; CHECK-NEXT: [[VCONV_FP32_BF16_:%[0-9]+]]:acc512 = VCONV_FP32_BF16 [[VLD_pstm_pseudo]] ; CHECK-NEXT: PseudoRET 
implicit $lr, implicit [[VCONV_FP32_BF16_]], implicit $m1 %0:ptrregbank(p0) = COPY $p0 %7:modregbank(s20) = COPY $m0 diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-vlda_ups-unsafe-to-move.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-vlda_ups-unsafe-to-move.mir index 0e4c981e02ca..7dbc1ef77ac0 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-vlda_ups-unsafe-to-move.mir +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-vlda_ups-unsafe-to-move.mir @@ -22,12 +22,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:edj = COPY $m0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec256 = COPY $wl0 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx [[COPY]], [[COPY1]] :: (load (<16 x s16>)) + ; CHECK-NEXT: [[VLD_idx_pseudo:%[0-9]+]]:vec256 = VLD_idx_pseudo [[COPY]], [[COPY1]] :: (load (<16 x s16>)) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:er = COPY $r1 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:mss = COPY [[COPY3]] ; CHECK-NEXT: [[VUPS_S32_D16_mv_ups_w2b:%[0-9]+]]:acc512 = VUPS_S32_D16_mv_ups_w2b [[COPY2]], [[COPY4]], implicit-def dead $srups_of, implicit $crsat, implicit $crupssign ; CHECK-NEXT: [[COPY5:%[0-9]+]]:mss = COPY [[COPY3]] - ; CHECK-NEXT: [[VUPS_S32_D16_mv_ups_w2b1:%[0-9]+]]:acc512 = VUPS_S32_D16_mv_ups_w2b [[VLDA_dmw_lda_w_ag_idx]], [[COPY5]], implicit-def dead $srups_of, implicit $crsat, implicit $crupssign + ; CHECK-NEXT: [[VUPS_S32_D16_mv_ups_w2b1:%[0-9]+]]:acc512 = VUPS_S32_D16_mv_ups_w2b [[VLD_idx_pseudo]], [[COPY5]], implicit-def dead $srups_of, implicit $crsat, implicit $crupssign ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VUPS_S32_D16_mv_ups_w2b]], implicit [[VUPS_S32_D16_mv_ups_w2b1]] %0:ptrregbank(p0) = COPY $p0 %1:modregbank(s20) = COPY $m0 diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-vldst-mmo.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-vldst-mmo.mir index ece76171b43a..7c276ceaba77 100644 --- 
a/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-vldst-mmo.mir +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-vldst-mmo.mir @@ -63,16 +63,16 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p1 ; CHECK-NEXT: [[PADD_imm_pseudo:%[0-9]+]]:ep = PADD_imm_pseudo [[COPY]], 128 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:mwa = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 96 :: (load (<32 x s32>) from %ir.p, !tbaa !0) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm1:%[0-9]+]]:mwa = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 64 :: (load (<32 x s32>) from %ir.p, !tbaa !0) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:mwa = VLD_idx_imm_3x32_pseudo [[COPY]], 96 :: (load (<32 x s32>) from %ir.p, !tbaa !0) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo1:%[0-9]+]]:mwa = VLD_idx_imm_3x32_pseudo [[COPY]], 64 :: (load (<32 x s32>) from %ir.p, !tbaa !0) ; CHECK-NEXT: [[DEF:%[0-9]+]]:mshfldst = IMPLICIT_DEF - ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:mshfldst = INSERT_SUBREG [[DEF]], [[VLDA_dmw_lda_w_ag_idx_imm1]], %subreg.sub_256_lo - ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:mshfldst = INSERT_SUBREG [[INSERT_SUBREG]], [[VLDA_dmw_lda_w_ag_idx_imm]], %subreg.sub_256_hi - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm2:%[0-9]+]]:mwa = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 32 :: (load (<32 x s32>) from %ir.p, !tbaa !0) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm3:%[0-9]+]]:mwa = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 0 :: (load (<32 x s32>) from %ir.p, !tbaa !0) + ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:mshfldst = INSERT_SUBREG [[DEF]], [[VLD_idx_imm_3x32_pseudo1]], %subreg.sub_256_lo + ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:mshfldst = INSERT_SUBREG [[INSERT_SUBREG]], [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo2:%[0-9]+]]:mwa = VLD_idx_imm_3x32_pseudo [[COPY]], 32 :: (load (<32 x s32>) from %ir.p, !tbaa !0) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo3:%[0-9]+]]:mwa = VLD_idx_imm_3x32_pseudo [[COPY]], 0 
:: (load (<32 x s32>) from %ir.p, !tbaa !0) ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mshfldst = IMPLICIT_DEF - ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:mshfldst = INSERT_SUBREG [[DEF1]], [[VLDA_dmw_lda_w_ag_idx_imm3]], %subreg.sub_256_lo - ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:mshfldst = INSERT_SUBREG [[INSERT_SUBREG2]], [[VLDA_dmw_lda_w_ag_idx_imm2]], %subreg.sub_256_hi + ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:mshfldst = INSERT_SUBREG [[DEF1]], [[VLD_idx_imm_3x32_pseudo3]], %subreg.sub_256_lo + ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:mshfldst = INSERT_SUBREG [[INSERT_SUBREG2]], [[VLD_idx_imm_3x32_pseudo2]], %subreg.sub_256_hi ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vec1024 = IMPLICIT_DEF ; CHECK-NEXT: [[INSERT_SUBREG4:%[0-9]+]]:vec1024 = INSERT_SUBREG [[DEF2]], [[INSERT_SUBREG3]], %subreg.sub_512_lo ; CHECK-NEXT: [[INSERT_SUBREG5:%[0-9]+]]:vec1024 = INSERT_SUBREG [[INSERT_SUBREG4]], [[INSERT_SUBREG1]], %subreg.sub_512_hi @@ -86,16 +86,16 @@ body: | ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vec256 = COPY [[COPY2]].sub_256_hi ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm [[COPY7]], [[COPY1]], 32 :: (store (<8 x s32>) into %ir.q + 32, basealign 128, !tbaa !0) ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm [[COPY6]], [[COPY1]], 0 :: (store (<8 x s32>) into %ir.q, align 128, !tbaa !0) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm4:%[0-9]+]]:mwa = VLDA_dmw_lda_w_ag_idx_imm [[PADD_imm_pseudo]], 96 :: (load (<32 x s32>) from %ir.incdec.ptr, !tbaa !0) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm5:%[0-9]+]]:mwa = VLDA_dmw_lda_w_ag_idx_imm [[PADD_imm_pseudo]], 64 :: (load (<32 x s32>) from %ir.incdec.ptr, !tbaa !0) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo4:%[0-9]+]]:mwa = VLD_idx_imm_3x32_pseudo [[PADD_imm_pseudo]], 96 :: (load (<32 x s32>) from %ir.incdec.ptr, !tbaa !0) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo5:%[0-9]+]]:mwa = VLD_idx_imm_3x32_pseudo [[PADD_imm_pseudo]], 64 :: (load (<32 x s32>) from %ir.incdec.ptr, !tbaa !0) ; CHECK-NEXT: [[DEF3:%[0-9]+]]:mshfldst = IMPLICIT_DEF - ; CHECK-NEXT: 
[[INSERT_SUBREG6:%[0-9]+]]:mshfldst = INSERT_SUBREG [[DEF3]], [[VLDA_dmw_lda_w_ag_idx_imm5]], %subreg.sub_256_lo - ; CHECK-NEXT: [[INSERT_SUBREG7:%[0-9]+]]:mshfldst = INSERT_SUBREG [[INSERT_SUBREG6]], [[VLDA_dmw_lda_w_ag_idx_imm4]], %subreg.sub_256_hi - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm6:%[0-9]+]]:mwa = VLDA_dmw_lda_w_ag_idx_imm [[PADD_imm_pseudo]], 32 :: (load (<32 x s32>) from %ir.incdec.ptr, !tbaa !0) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm7:%[0-9]+]]:mwa = VLDA_dmw_lda_w_ag_idx_imm [[PADD_imm_pseudo]], 0 :: (load (<32 x s32>) from %ir.incdec.ptr, !tbaa !0) + ; CHECK-NEXT: [[INSERT_SUBREG6:%[0-9]+]]:mshfldst = INSERT_SUBREG [[DEF3]], [[VLD_idx_imm_3x32_pseudo5]], %subreg.sub_256_lo + ; CHECK-NEXT: [[INSERT_SUBREG7:%[0-9]+]]:mshfldst = INSERT_SUBREG [[INSERT_SUBREG6]], [[VLD_idx_imm_3x32_pseudo4]], %subreg.sub_256_hi + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo6:%[0-9]+]]:mwa = VLD_idx_imm_3x32_pseudo [[PADD_imm_pseudo]], 32 :: (load (<32 x s32>) from %ir.incdec.ptr, !tbaa !0) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo7:%[0-9]+]]:mwa = VLD_idx_imm_3x32_pseudo [[PADD_imm_pseudo]], 0 :: (load (<32 x s32>) from %ir.incdec.ptr, !tbaa !0) ; CHECK-NEXT: [[DEF4:%[0-9]+]]:mshfldst = IMPLICIT_DEF - ; CHECK-NEXT: [[INSERT_SUBREG8:%[0-9]+]]:mshfldst = INSERT_SUBREG [[DEF4]], [[VLDA_dmw_lda_w_ag_idx_imm7]], %subreg.sub_256_lo - ; CHECK-NEXT: [[INSERT_SUBREG9:%[0-9]+]]:mshfldst = INSERT_SUBREG [[INSERT_SUBREG8]], [[VLDA_dmw_lda_w_ag_idx_imm6]], %subreg.sub_256_hi + ; CHECK-NEXT: [[INSERT_SUBREG8:%[0-9]+]]:mshfldst = INSERT_SUBREG [[DEF4]], [[VLD_idx_imm_3x32_pseudo7]], %subreg.sub_256_lo + ; CHECK-NEXT: [[INSERT_SUBREG9:%[0-9]+]]:mshfldst = INSERT_SUBREG [[INSERT_SUBREG8]], [[VLD_idx_imm_3x32_pseudo6]], %subreg.sub_256_hi ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vec1024 = IMPLICIT_DEF ; CHECK-NEXT: [[INSERT_SUBREG10:%[0-9]+]]:vec1024 = INSERT_SUBREG [[DEF5]], [[INSERT_SUBREG9]], %subreg.sub_512_lo ; CHECK-NEXT: [[INSERT_SUBREG11:%[0-9]+]]:vec1024 = INSERT_SUBREG 
[[INSERT_SUBREG10]], [[INSERT_SUBREG7]], %subreg.sub_512_hi @@ -129,16 +129,16 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p1 ; CHECK-NEXT: [[PADD_imm_pseudo:%[0-9]+]]:ep = PADD_imm_pseudo [[COPY]], 64 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 32 :: (load (<16 x s16>) from %ir.p + 32, basealign 64, !tbaa !0) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm1:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 0 :: (load (<16 x s16>) from %ir.p, align 64, !tbaa !0) - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_idx_imm1]], %subreg.sub_256_lo, [[VLDA_dmw_lda_w_ag_idx_imm]], %subreg.sub_256_hi + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 32 :: (load (<16 x s16>) from %ir.p + 32, basealign 64, !tbaa !0) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo1:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 0 :: (load (<16 x s16>) from %ir.p, align 64, !tbaa !0) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_idx_imm_3x32_pseudo1]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec256 = COPY [[REG_SEQUENCE]].sub_256_lo ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vec256 = COPY [[REG_SEQUENCE]].sub_256_hi ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm [[COPY3]], [[COPY1]], 32 :: (store (<16 x s16>) into %ir.q + 32, basealign 64, !tbaa !0) ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm [[COPY2]], [[COPY1]], 0 :: (store (<16 x s16>) into %ir.q, align 64, !tbaa !0) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm2:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[PADD_imm_pseudo]], 32 :: (load (<16 x s16>) from %ir.incdec.ptr + 32, basealign 64, !tbaa !0) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm3:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[PADD_imm_pseudo]], 0 :: (load (<16 x s16>) from %ir.incdec.ptr, align 64, !tbaa !0) - ; CHECK-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_idx_imm3]], %subreg.sub_256_lo, [[VLDA_dmw_lda_w_ag_idx_imm2]], %subreg.sub_256_hi + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo2:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[PADD_imm_pseudo]], 32 :: (load (<16 x s16>) from %ir.incdec.ptr + 32, basealign 64, !tbaa !0) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo3:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[PADD_imm_pseudo]], 0 :: (load (<16 x s16>) from %ir.incdec.ptr, align 64, !tbaa !0) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_idx_imm_3x32_pseudo3]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo2]], %subreg.sub_256_hi ; CHECK-NEXT: $x0 = COPY [[REG_SEQUENCE1]] %1:ptrregbank(p0) = COPY $p0 %2:ptrregbank(p0) = COPY $p1 @@ -166,10 +166,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ep = COPY $p1 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 0 :: (load (<32 x s8>) from %ir.p, !tbaa !0) - ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm [[VLDA_dmw_lda_w_ag_idx_imm]], [[COPY1]], 0 :: (store (<32 x s8>) into %ir.q, !tbaa !0) - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm1:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 32 :: (load (<32 x s8>) from %ir.incdec.ptr, !tbaa !0) - ; CHECK-NEXT: $wl0 = COPY [[VLDA_dmw_lda_w_ag_idx_imm1]] + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 0 :: (load (<32 x s8>) from %ir.p, !tbaa !0) + ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm [[VLD_idx_imm_3x32_pseudo]], [[COPY1]], 0 :: (store (<32 x s8>) into %ir.q, !tbaa !0) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo1:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 32 :: (load (<32 x s8>) from %ir.incdec.ptr, !tbaa !0) + ; CHECK-NEXT: $wl0 = COPY [[VLD_idx_imm_3x32_pseudo1]] %1:ptrregbank(p0) = COPY $p0 %2:ptrregbank(p0) = COPY $p1 %3:modregbank(s20) = G_CONSTANT i20 32 diff --git 
a/llvm/test/CodeGen/AIE/aie2/GlobalISel/vld-ups-combine-store-after-load.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/vld-ups-combine-store-after-load.mir index 7f0def4ba02d..5e107f545600 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/vld-ups-combine-store-after-load.mir +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/vld-ups-combine-store-after-load.mir @@ -24,11 +24,11 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY $m0 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:er = COPY $r0 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vec256 = COPY $amll0 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm1:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm [[COPY]], [[COPY2]] :: (load (<16 x s16>)) + ; CHECK-NEXT: [[VLD_pstm_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_pseudo1:%[0-9]+]]:ep = VLD_pstm_pseudo [[COPY]], [[COPY2]] :: (load (<16 x s16>)) ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm [[COPY4]], [[COPY1]], 0 :: (store (<16 x s16>)) ; CHECK-NEXT: $crupssign = COPY [[COPY3]] ; CHECK-NEXT: [[COPY5:%[0-9]+]]:mss = COPY [[COPY3]] - ; CHECK-NEXT: [[VUPS_S64_D16_mv_ups_w2c:%[0-9]+]]:mcms = VUPS_S64_D16_mv_ups_w2c [[VLDA_dmw_lda_w_ag_pstm_nrm]], [[COPY5]], implicit-def $srups_of, implicit $crsat, implicit $crupssign + ; CHECK-NEXT: [[VUPS_S64_D16_mv_ups_w2c:%[0-9]+]]:mcms = VUPS_S64_D16_mv_ups_w2c [[VLD_pstm_pseudo]], [[COPY5]], implicit-def $srups_of, implicit $crsat, implicit $crupssign ; CHECK-NEXT: $crupssign = MOV_scalar_imm10_pseudo 0 ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VUPS_S64_D16_mv_ups_w2c]] %0:ptrregbank(p0) = COPY $p0 diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/vld-ups-combine-use-before-def.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/vld-ups-combine-use-before-def.mir index 0e66fbef7573..46a94086264d 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/vld-ups-combine-use-before-def.mir +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/vld-ups-combine-use-before-def.mir @@ -22,12 +22,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 ; CHECK-NEXT: 
[[COPY1:%[0-9]+]]:em = COPY $m0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:er = COPY $r0 - ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_pstm_nrm:%[0-9]+]]:vec256, [[VLDA_dmw_lda_w_ag_pstm_nrm1:%[0-9]+]]:ep = VLDA_dmw_lda_w_ag_pstm_nrm [[COPY]], [[COPY1]] :: (load (<16 x s16>)) - ; CHECK-NEXT: $m1 = COPY [[VLDA_dmw_lda_w_ag_pstm_nrm1]] + ; CHECK-NEXT: [[VLD_pstm_pseudo:%[0-9]+]]:vec256, [[VLD_pstm_pseudo1:%[0-9]+]]:ep = VLD_pstm_pseudo [[COPY]], [[COPY1]] :: (load (<16 x s16>)) + ; CHECK-NEXT: $m1 = COPY [[VLD_pstm_pseudo1]] ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ep = COPY $m1 ; CHECK-NEXT: $crupssign = COPY [[COPY2]] ; CHECK-NEXT: [[COPY4:%[0-9]+]]:mss = COPY [[COPY2]] - ; CHECK-NEXT: [[VUPS_S64_D16_mv_ups_w2c:%[0-9]+]]:mcms = VUPS_S64_D16_mv_ups_w2c [[VLDA_dmw_lda_w_ag_pstm_nrm]], [[COPY4]], implicit-def $srups_of, implicit $crsat, implicit $crupssign + ; CHECK-NEXT: [[VUPS_S64_D16_mv_ups_w2c:%[0-9]+]]:mcms = VUPS_S64_D16_mv_ups_w2c [[VLD_pstm_pseudo]], [[COPY4]], implicit-def $srups_of, implicit $crsat, implicit $crupssign ; CHECK-NEXT: $crupssign = MOV_scalar_imm10_pseudo 0 ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[VUPS_S64_D16_mv_ups_w2c]], implicit [[COPY3]] %0:ptrregbank(p0) = COPY $p0 diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll index 8747a1d676bb..1c68dba7cb32 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll @@ -300,17 +300,17 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: .LBB0_1: // %outer.loop.header ; DCL-NEXT: // =>This Loop Header: Depth=1 ; DCL-NEXT: // Child Loop BB0_2 Depth 2 -; DCL-NEXT: vlda wl6, [p1], #32; nopxm -; DCL-NEXT: vlda wl3, [p0], m6; mov r0, p0 +; DCL-NEXT: vldb wl6, [p1], #32; nopxm +; DCL-NEXT: vldb wl3, [p0], m6; mov r0, p0 ; DCL-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32] -; DCL-NEXT: vlda wh6, [p1], #32 -; DCL-NEXT: vlda wh3, [p0], m6; mov m5, 
p4 +; DCL-NEXT: vldb wh6, [p1], #32 +; DCL-NEXT: vldb wh3, [p0], m6; mov m5, p4 ; DCL-NEXT: vlda.ups.s32.s16 bml0, s0, [p2], m5 -; DCL-NEXT: vlda wl8, [p1], #32 -; DCL-NEXT: vlda wl7, [p0], m6 +; DCL-NEXT: vldb wl8, [p1], #32 +; DCL-NEXT: vldb wl7, [p0], m6 ; DCL-NEXT: vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m1, p5 ; DCL-NEXT: vlda.ups.s32.s16 bml1, s0, [p2], m1 -; DCL-NEXT: vlda.3d wh7, [p0], d0 +; DCL-NEXT: vldb.3d wh7, [p0], d0 ; DCL-NEXT: vlda.ups.s32.s16 bmh2, s0, [p2, #32] ; DCL-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m5 ; DCL-NEXT: vlda.ups.s32.s16 bmh3, s0, [p2, #32]; mov m2, r15 @@ -322,30 +322,30 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; DCL-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32] ; DCL-NEXT: vlda.ups.s32.s16 bml6, s0, [p2], m5 ; DCL-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32] -; DCL-NEXT: vlda wh8, [p1], #32 -; DCL-NEXT: vlda wl5, [p0], m6; mov r1, p0 +; DCL-NEXT: vldb wh8, [p1], #32 +; DCL-NEXT: vldb wl5, [p0], m6; mov r1, p0 ; DCL-NEXT: vlda.ups.s32.s16 bml7, s0, [p2, #0]; and r0, r0, r9 -; DCL-NEXT: vlda wh5, [p0], m6; add r0, r0, #33 -; DCL-NEXT: vlda wl3, [p0], m6; vshift.align x4, x4, s1, x3, r0 -; DCL-NEXT: vlda.3d wh3, [p0], d0; and r10, r1, r9; vshift.align x2, x2, s1, x7, r0 -; DCL-NEXT: vlda wl1, [p1], #32; add r0, r10, #33; mov r10, p0 -; DCL-NEXT: vlda wh1, [p1], #32; add r1, r5, #-1; vshuffle x7, x4, x2, r2 -; DCL-NEXT: vlda wl10, [p1], #32; add r1, r1, #-1; vshuffle x9, x7, x0, r8 -; DCL-NEXT: vlda wh10, [p1], #32; and r10, r10, r9 +; DCL-NEXT: vldb wh5, [p0], m6; add r0, r0, #33 +; DCL-NEXT: vldb wl3, [p0], m6; vshift.align x4, x4, s1, x3, r0 +; DCL-NEXT: vldb.3d wh3, [p0], d0; and r10, r1, r9; vshift.align x2, x2, s1, x7, r0 +; DCL-NEXT: vldb wl1, [p1], #32; add r0, r10, #33; mov r10, p0 +; DCL-NEXT: vldb wh1, [p1], #32; add r1, r5, #-1; vshuffle x7, x4, x2, r2 +; DCL-NEXT: vldb wl10, [p1], #32; add r1, r1, #-1; vshuffle x9, x7, x0, r8 +; DCL-NEXT: vldb wh10, [p1], #32; and r10, r10, r9 ; 
DCL-NEXT: .p2align 4 ; DCL-NEXT: .LBB0_2: // %inner.loop ; DCL-NEXT: // Parent Loop BB0_1 Depth=1 ; DCL-NEXT: // => This Inner Loop Header: Depth=2 ; DCL-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x9, x4, x2, r3; vmac cm1, cm1, x9, x6, r4 ; DCL-NEXT: nopa ; nopb ; nopx ; vshift.align x4, x4, s1, x5, r0; vmac cm5, cm5, x9, x8, r4 -; DCL-NEXT: vlda wl5, [p0], m6; vshift.align x2, x2, s1, x3, r0 -; DCL-NEXT: vlda wh5, [p0], m6; add r1, r1, #-1; vshuffle x11, x9, x0, r8 -; DCL-NEXT: vlda wl3, [p0], m6; jnz r1, #.LBB0_2; vmac cm0, cm0, x7, x6, r4 -; DCL-NEXT: vlda.3d wh3, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 // Delay Slot 5 -; DCL-NEXT: vlda wl1, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 // Delay Slot 4 -; DCL-NEXT: vlda wh1, [p1], #32; vmov x6, x1; vmac cm6, cm6, x9, x8, r4 // Delay Slot 3 -; DCL-NEXT: vlda wl10, [p1], #32; add r0, r10, #33; mov r10, p0; vmac cm3, cm3, x11, x6, r4 // Delay Slot 2 -; DCL-NEXT: vlda wh10, [p1], #32; and r10, r10, r9; vmov x8, x10; vmac cm7, cm7, x11, x8, r4 // Delay Slot 1 +; DCL-NEXT: vldb wl5, [p0], m6; vshift.align x2, x2, s1, x3, r0 +; DCL-NEXT: vldb wh5, [p0], m6; add r1, r1, #-1; vshuffle x11, x9, x0, r8 +; DCL-NEXT: vldb wl3, [p0], m6; jnz r1, #.LBB0_2; vmac cm0, cm0, x7, x6, r4 +; DCL-NEXT: vldb.3d wh3, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 // Delay Slot 5 +; DCL-NEXT: vldb wl1, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 // Delay Slot 4 +; DCL-NEXT: vldb wh1, [p1], #32; vmov x6, x1; vmac cm6, cm6, x9, x8, r4 // Delay Slot 3 +; DCL-NEXT: vldb wl10, [p1], #32; add r0, r10, #33; mov r10, p0; vmac cm3, cm3, x11, x6, r4 // Delay Slot 2 +; DCL-NEXT: vldb wh10, [p1], #32; and r10, r10, r9; vmov x8, x10; vmac cm7, cm7, x11, x8, r4 // Delay Slot 1 ; DCL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1 ; DCL-NEXT: nopa ; nopx ; vmov x11, x0 ; DCL-NEXT: vshuffle x0, x4, x2, r3 @@ -480,21 +480,21 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, 
ptr %add.ptr5, ptr %c ; ZOL-NEXT: .LBB0_1: // %outer.loop.header ; ZOL-NEXT: // =>This Loop Header: Depth=1 ; ZOL-NEXT: // Child Loop BB0_2 Depth 2 -; ZOL-NEXT: vlda wl6, [p1], #32; nopx -; ZOL-NEXT: vlda wl3, [p0], m6; mov r0, p0 +; ZOL-NEXT: vldb wl6, [p1], #32; nopx +; ZOL-NEXT: vldb wl3, [p0], m6; mov r0, p0 ; ZOL-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32] -; ZOL-NEXT: vlda wh6, [p1], #32 -; ZOL-NEXT: vlda wh3, [p0], m6; mov m5, p4 +; ZOL-NEXT: vldb wh6, [p1], #32 +; ZOL-NEXT: vldb wh3, [p0], m6; mov m5, p4 ; ZOL-NEXT: vlda.ups.s32.s16 bml0, s0, [p2], m5 -; ZOL-NEXT: vlda wl8, [p1], #32 -; ZOL-NEXT: vlda wl7, [p0], m6 +; ZOL-NEXT: vldb wl8, [p1], #32 +; ZOL-NEXT: vldb wl7, [p0], m6 ; ZOL-NEXT: vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m1, p5 ; ZOL-NEXT: vlda.ups.s32.s16 bml1, s0, [p2], m1 -; ZOL-NEXT: vlda wh8, [p1], #32 -; ZOL-NEXT: vlda.3d wh7, [p0], d0 +; ZOL-NEXT: vldb wh8, [p1], #32 +; ZOL-NEXT: vldb.3d wh7, [p0], d0 ; ZOL-NEXT: vlda.ups.s32.s16 bmh2, s0, [p2, #32] ; ZOL-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m5 -; ZOL-NEXT: vlda wl1, [p1], #32 +; ZOL-NEXT: vldb wl1, [p1], #32 ; ZOL-NEXT: vlda.ups.s32.s16 bmh3, s0, [p2, #32]; mov m2, r14 ; ZOL-NEXT: vlda.ups.s32.s16 bml3, s0, [p2], m2 ; ZOL-NEXT: vlda.ups.s32.s16 bmh4, s0, [p2, #32] @@ -503,30 +503,30 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m1; movxm ls, #.LBB0_2 ; ZOL-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32]; add.nc r1, r5, #-2 ; ZOL-NEXT: vlda.ups.s32.s16 bml6, s0, [p2], m5; mov lc, r1 -; ZOL-NEXT: vlda wl5, [p0], m6; mov r1, p0 -; ZOL-NEXT: vlda wh5, [p0], m6; movxm le, #.L_LEnd0 +; ZOL-NEXT: vldb wl5, [p0], m6; mov r1, p0 +; ZOL-NEXT: vldb wh5, [p0], m6; movxm le, #.L_LEnd0 ; ZOL-NEXT: nopb ; vlda.ups.s32.s16 bmh7, s0, [p2, #32]; nops ; and r0, r0, r9; nopm ; nopv -; ZOL-NEXT: nopb ; vlda wl3, [p0], m6; nops ; add r0, r0, #33; nopm ; nopv -; ZOL-NEXT: nopb ; vlda.3d wh3, [p0], d0; nops ; nopx ; vshift.align x4, 
x4, s1, x3, r0; nopv +; ZOL-NEXT: vldb wl3, [p0], m6; nopa ; nops ; add r0, r0, #33; nopm ; nopv +; ZOL-NEXT: vldb.3d wh3, [p0], d0; nopa ; nops ; nopx ; vshift.align x4, x4, s1, x3, r0; nopv ; ZOL-NEXT: nopb ; vlda.ups.s32.s16 bml7, s0, [p2, #0]; nops ; and r1, r1, r9; vshift.align x2, x2, s1, x7, r0; nopv -; ZOL-NEXT: nopb ; vlda wh1, [p1], #32; nops ; add r0, r1, #33; mov r1, p0; nopv -; ZOL-NEXT: nopb ; vlda wl10, [p1], #32; nops ; nopx ; vshuffle x7, x4, x2, r2; nopv -; ZOL-NEXT: vlda wh10, [p1], #32; nopb ; nopx ; vshuffle x9, x7, x0, r8 +; ZOL-NEXT: vldb wh1, [p1], #32; nopa ; nops ; add r0, r1, #33; mov r1, p0; nopv +; ZOL-NEXT: vldb wl10, [p1], #32; nopa ; nops ; nopx ; vshuffle x7, x4, x2, r2; nopv +; ZOL-NEXT: nopa ; vldb wh10, [p1], #32; nopx ; vshuffle x9, x7, x0, r8 ; ZOL-NEXT: and r1, r1, r9 ; ZOL-NEXT: .p2align 4 ; ZOL-NEXT: .LBB0_2: // %inner.loop ; ZOL-NEXT: // Parent Loop BB0_1 Depth=1 ; ZOL-NEXT: // => This Inner Loop Header: Depth=2 ; ZOL-NEXT: nopa ; nopx ; vshuffle x9, x4, x2, r3; vmac cm1, cm1, x9, x6, r4 -; ZOL-NEXT: vlda wl5, [p0], m6; vshift.align x4, x4, s1, x5, r0; vmac cm5, cm5, x9, x8, r4 -; ZOL-NEXT: vlda wh5, [p0], m6; vshift.align x2, x2, s1, x3, r0 -; ZOL-NEXT: vlda wl3, [p0], m6; vshuffle x11, x9, x0, r8; vmac cm0, cm0, x7, x6, r4 -; ZOL-NEXT: vlda.3d wh3, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 -; ZOL-NEXT: vlda wl1, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 -; ZOL-NEXT: vlda wh1, [p1], #32; vmov x6, x1; vmac cm6, cm6, x9, x8, r4 -; ZOL-NEXT: vlda wl10, [p1], #32; add r0, r1, #33; mov r1, p0; vmac cm3, cm3, x11, x6, r4 +; ZOL-NEXT: vldb wl5, [p0], m6; vshift.align x4, x4, s1, x5, r0; vmac cm5, cm5, x9, x8, r4 +; ZOL-NEXT: vldb wh5, [p0], m6; vshift.align x2, x2, s1, x3, r0 +; ZOL-NEXT: vldb wl3, [p0], m6; vshuffle x11, x9, x0, r8; vmac cm0, cm0, x7, x6, r4 +; ZOL-NEXT: vldb.3d wh3, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm4, cm4, x7, x8, r4 +; ZOL-NEXT: vldb wl1, [p1], #32; vshuffle x9, 
x7, x0, r8; vmac cm2, cm2, x9, x6, r4 +; ZOL-NEXT: vldb wh1, [p1], #32; vmov x6, x1; vmac cm6, cm6, x9, x8, r4 +; ZOL-NEXT: vldb wl10, [p1], #32; add r0, r1, #33; mov r1, p0; vmac cm3, cm3, x11, x6, r4 ; ZOL-NEXT: .L_LEnd0: -; ZOL-NEXT: nopb ; vlda wh10, [p1], #32; nops ; and r1, r1, r9; vmov x8, x10; vmac cm7, cm7, x11, x8, r4 +; ZOL-NEXT: vldb wh10, [p1], #32; nopa ; nops ; and r1, r1, r9; vmov x8, x10; vmac cm7, cm7, x11, x8, r4 ; ZOL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1 ; ZOL-NEXT: nopa ; nopx ; vmov x11, x0 ; ZOL-NEXT: vshuffle x0, x4, x2, r3 diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll index ece27639a0ac..d020da70b989 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll @@ -129,17 +129,17 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ASM-NEXT: .LBB0_2: // %inner.loop ; ASM-NEXT: // Parent Loop BB0_1 Depth=1 ; ASM-NEXT: // => This Inner Loop Header: Depth=2 -; ASM-NEXT: vlda wl6, [p0], m6; nopx -; ASM-NEXT: vlda wh6, [p0], m6 -; ASM-NEXT: vlda wl8, [p0], m6 -; ASM-NEXT: vlda.3d wh8, [p0], d0 +; ASM-NEXT: vldb wl6, [p0], m6; nopx +; ASM-NEXT: vldb wh6, [p0], m6 +; ASM-NEXT: vldb wl8, [p0], m6 +; ASM-NEXT: vldb.3d wh8, [p0], d0 ; ASM-NEXT: nop -; ASM-NEXT: vlda wl10, [p1], #32 -; ASM-NEXT: vlda wh10, [p1], #32 +; ASM-NEXT: vldb wl10, [p1], #32 +; ASM-NEXT: vldb wh10, [p1], #32 ; ASM-NEXT: nop ; ASM-NEXT: nop -; ASM-NEXT: vlda wl6, [p1], #32; vshift.align x4, x4, s1, x6, r1 -; ASM-NEXT: vlda wh6, [p1], #32; vshift.align x2, x2, s1, x8, r1 +; ASM-NEXT: vldb wl6, [p1], #32; vshift.align x4, x4, s1, x6, r1 +; ASM-NEXT: vldb wh6, [p1], #32; vshift.align x2, x2, s1, x8, r1 ; ASM-NEXT: vshuffle x8, x4, x2, r2 ; ASM-NEXT: vshuffle x3, x4, x2, r3 ; ASM-NEXT: vshuffle x1, x8, x0, r8; vmac cm0, cm0, x8, x10, r4 diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Memops.ll 
b/llvm/test/CodeGen/AIE/aie2/end-to-end/Memops.ll index 737f6fa07b41..d673d99f4925 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Memops.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Memops.ll @@ -184,7 +184,7 @@ define dso_local void @lowerMemcpyUsingWordVector32() local_unnamed_addr #0 { ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopa ; nopb ; movxm p0, #buffer2 -; CHECK-NEXT: vlda wh0, [p0], #32 +; CHECK-NEXT: vldb wh0, [p0], #32 ; CHECK-NEXT: lda r0, [p0], #4 ; CHECK-NEXT: lda r1, [p0], #4 ; CHECK-NEXT: lda r2, [p0], #4 diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll index 096219363439..59d6b1aeb1f8 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Mul2D.ll @@ -83,10 +83,10 @@ define void @mul2d(ptr noalias %in_ptr0, ptr noalias %in_ptr1, ptr noalias %out_ ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vlda wl2, [p1], #32; nopx -; CHECK-NEXT: vlda.3d wl6, [p0], d0 -; CHECK-NEXT: vlda wl4, [p1], #32 -; CHECK-NEXT: vlda.3d wl2, [p0], d0 +; CHECK-NEXT: vldb wl2, [p1], #32; nopx +; CHECK-NEXT: vldb.3d wl6, [p0], d0 +; CHECK-NEXT: vldb wl4, [p1], #32 +; CHECK-NEXT: vldb.3d wl2, [p0], d0 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop diff --git a/llvm/test/CodeGen/AIE/aie2/mmo-load.ll b/llvm/test/CodeGen/AIE/aie2/mmo-load.ll index 38049bf2c495..cc1eb5889d7f 100644 --- a/llvm/test/CodeGen/AIE/aie2/mmo-load.ll +++ b/llvm/test/CodeGen/AIE/aie2/mmo-load.ll @@ -60,9 +60,9 @@ entry: declare void @sink_v8i32(<8 x i32>, <8 x i32>) define void @load_v8i32(i32 %idx, ptr %array) { ; CHECK-LABEL: name: load_v8i32 - ; CHECK: VLDA_dmw_lda_w_ag_idx + ; CHECK: VLD_idx_pseudo ; CHECK-SAME: (load (<8 x s32>) from %ir.arrayidx.0) - ; CHECK: VLDA_dmw_lda_w_ag_idx_imm + ; CHECK: VLD_idx_imm_3x32_pseudo ; CHECK-SAME: (load (<8 x s32>) from %ir.arrayidx.1) 
entry: %arrayidx.0 = getelementptr inbounds [16 x <8 x i32>], ptr %array, i32 0, i32 %idx @@ -96,8 +96,8 @@ entry: declare void @sink_v32i32(<32 x i32>, <32 x i32>) define void @load_v32i32(i32 %idx, ptr %array) { ; CHECK-LABEL: name: load_v32i32 - ; CHECK-COUNT-4: VLDA_dmw_lda_w_ag_idx_imm {{.*}} (load (<32 x s32>) from %ir.arrayidx.0, align 32) - ; CHECK-COUNT-4: VLDA_dmw_lda_w_ag_idx_imm {{.*}} (load (<32 x s32>) from %ir.arrayidx.1, align 32) + ; CHECK-COUNT-4: VLD_idx_imm_3x32_pseudo {{.*}} (load (<32 x s32>) from %ir.arrayidx.0, align 32) + ; CHECK-COUNT-4: VLD_idx_imm_3x32_pseudo {{.*}} (load (<32 x s32>) from %ir.arrayidx.1, align 32) entry: %arrayidx.0 = getelementptr inbounds [16 x <32 x i32>], ptr %array, i32 0, i32 %idx %0 = load <32 x i32>, ptr %arrayidx.0, align 32 diff --git a/llvm/test/CodeGen/AIE/aie2/ra/split-instrs-create.mir b/llvm/test/CodeGen/AIE/aie2/ra/split-instrs-create.mir index 6d7b342b59d1..28cc3f4271ab 100644 --- a/llvm/test/CodeGen/AIE/aie2/ra/split-instrs-create.mir +++ b/llvm/test/CodeGen/AIE/aie2/ra/split-instrs-create.mir @@ -111,6 +111,51 @@ body: | dead %2:mwa, dead %20:ep, dead %100.sub_dim_count:eds, dead %100.sub_hi_dim_then_sub_dim_count:eds = VLDB_3D killed %20, killed %100 :: (load (<8 x s32>) from unknown-address) ... 
+--- +name: VLD_2D_MultiSlot +legalized: true +regBankSelected: true +selected: true +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $p0, $d1 + ; CHECK-LABEL: name: VLD_2D_MultiSlot + ; CHECK: liveins: $p0, $d1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ed = COPY $d1 + ; CHECK-NEXT: dead [[VLD_2D_pseudo_split:%[0-9]+]]:mwa, [[COPY:%[0-9]+]]:ep, [[COPY1:%[0-9]+]].sub_dim_count:ed = VLD_2D_pseudo_split killed [[COPY]], killed [[COPY1]].sub_mod, killed [[COPY1]].sub_dim_size, killed [[COPY1]].sub_dim_stride, killed [[COPY1]].sub_dim_count :: (load (<8 x s32>)) + ; CHECK-NEXT: dead [[VLD_2D_pseudo_split1:%[0-9]+]]:mwa, dead [[COPY:%[0-9]+]]:ep, dead [[COPY1:%[0-9]+]].sub_dim_count:ed = VLD_2D_pseudo_split killed [[COPY]], killed [[COPY1]].sub_mod, killed [[COPY1]].sub_dim_size, killed [[COPY1]].sub_dim_stride, killed [[COPY1]].sub_dim_count :: (load (<8 x s32>)) + %20:ep = COPY $p0 + %100:ed = COPY $d1 + dead %0:mwa, %20:ep, %100.sub_dim_count:ed = VLD_2D_pseudo killed %20, killed %100 :: (load (<8 x s32>) from unknown-address) + dead %2:mwa, dead %20:ep, dead %100.sub_dim_count:ed = VLD_2D_pseudo killed %20, killed %100 :: (load (<8 x s32>) from unknown-address) +... 
+ + +--- +name: VLD_3D_MultiSlot +alignment: 16 +legalized: true +regBankSelected: true +selected: true +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $p0, $d1_3d + ; CHECK-LABEL: name: VLD_3D_MultiSlot + ; CHECK: liveins: $p0, $d1_3d + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:eds = COPY $d1_3d + ; CHECK-NEXT: dead [[VLD_3D_pseudo_split:%[0-9]+]]:mwa, [[COPY:%[0-9]+]]:ep, [[COPY1:%[0-9]+]].sub_dim_count:eds, [[COPY1:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLD_3D_pseudo_split killed [[COPY]], killed [[COPY1]].sub_mod, killed [[COPY1]].sub_dim_size, killed [[COPY1]].sub_dim_stride, killed [[COPY1]].sub_dim_count, killed undef [[COPY1]].sub_hi_dim_then_sub_mod, killed [[COPY1]].sub_hi_dim_then_sub_dim_size, killed [[COPY1]].sub_hi_dim_then_sub_dim_stride, killed [[COPY1]].sub_hi_dim_then_sub_dim_count :: (load (<8 x s32>)) + ; CHECK-NEXT: dead [[VLD_3D_pseudo_split1:%[0-9]+]]:mwa, dead [[COPY:%[0-9]+]]:ep, dead [[COPY1:%[0-9]+]].sub_dim_count:eds, dead [[COPY1:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLD_3D_pseudo_split killed [[COPY]], killed [[COPY1]].sub_mod, killed [[COPY1]].sub_dim_size, killed [[COPY1]].sub_dim_stride, killed [[COPY1]].sub_dim_count, killed undef [[COPY1]].sub_hi_dim_then_sub_mod, killed [[COPY1]].sub_hi_dim_then_sub_dim_size, killed [[COPY1]].sub_hi_dim_then_sub_dim_stride, killed [[COPY1]].sub_hi_dim_then_sub_dim_count :: (load (<8 x s32>)) + %20:ep = COPY $p0 + %100:eds = COPY $d1_3d + dead %0:mwa, %20:ep, %100.sub_dim_count:eds, %100.sub_hi_dim_then_sub_dim_count:eds = VLD_3D_pseudo killed %20, killed %100 :: (load (<8 x s32>) from unknown-address) + dead %2:mwa, dead %20:ep, dead %100.sub_dim_count:eds, dead %100.sub_hi_dim_then_sub_dim_count:eds = VLD_3D_pseudo killed %20, killed %100 :: (load (<8 x s32>) from unknown-address) +... 
--- name: VLD_2D_CONV diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/resource/memory_bank_vld_multi_slot.mir b/llvm/test/CodeGen/AIE/aie2/schedule/resource/memory_bank_vld_multi_slot.mir new file mode 100644 index 000000000000..41b5eed7ce54 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/resource/memory_bank_vld_multi_slot.mir @@ -0,0 +1,1083 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# RUN: llc -march=aie2 -run-pass=postmisched %s -o - | FileCheck %s + +# This test checks that we are scheduling multislot loads from different banks in one VLIW bundle. + +--- +name: VLDA_VLDB_same_bankA +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_same_bankA + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 5) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 5) +... 
+ +--- +name: VLDA_VLDB_same_bankB +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_same_bankB + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 6) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 6) +... + +--- +name: VLDA_VLDB_same_bankC +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_same_bankC + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 7) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 7) +... + +--- +name: VLDA_VLDB_same_bankD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_same_bankD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 8) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 8) +... 
+ +--- +name: VLDA_VLDB_same_bankAB +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_same_bankAB + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 9) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 9) +... + +--- +name: VLDA_VLDB_same_bankAC +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_same_bankAC + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 10) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 10) +... + +--- +name: VLDA_VLDB_same_bankAD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_same_bankAD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 11) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 11) +... 
+ +--- +name: VLDA_VLDB_same_bankBC +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_same_bankBC + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 12) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 12) +... + +--- +name: VLDA_VLDB_same_bankBD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_same_bankBD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 13) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 13) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 13) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 13) +... + +--- +name: VLDA_VLDB_same_bankCD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_same_bankCD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 14) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 14) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 14) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 14) +... 
+ +--- +name: VLDA_VLDB_different_bank_A_and_B +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_A_and_B + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 5) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 6) +... + +--- +name: VLDA_VLDB_different_bank_A_and_C +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_A_and_C + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 5) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 7) +... 
+ +--- +name: VLDA_VLDB_different_bank_A_and_D +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_A_and_D + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 5) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 8) +... + +--- +name: VLDA_VLDB_different_bank_B_and_C +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_B_and_C + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 6) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 7) +... 
+ +--- +name: VLDA_VLDB_different_bank_B_and_D +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_B_and_D + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 6) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 8) +... + +--- +name: VLDA_VLDB_different_bank_C_and_D +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_C_and_D + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 7) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 8) +... 
+ + +--- +name: VLDA_VLDB_different_bank_A_B_C_D +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_A_B_C_D + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: } + ; CHECK-NEXT: BUNDLE implicit-def $wh1, implicit-def $wh2, implicit killed $p0 { + ; CHECK-NEXT: $wh1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: $wh2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 5) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 6) + $wh1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 7) + $wh2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 8) +... + + +--- +name: VLDA_VLDB_different_bank_AB_A +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AB_A + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 9) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 5) +... 
+ +--- +name: VLDA_VLDB_different_bank_AB_B +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AB_B + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 9) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 6) +... + +--- +name: VLDA_VLDB_different_bank_AB_C +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AB_C + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 9) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 7) +... + + +--- +name: VLDA_VLDB_different_bank_AB_D +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AB_D + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 9) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 8) +... 
+ +--- +name: VLDA_VLDB_different_bank_AC_A +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AC_A + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 10) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 5) +... + +--- +name: VLDA_VLDB_different_bank_AC_B +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AC_B + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 10) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 6) +... + +--- +name: VLDA_VLDB_different_bank_AC_C +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AC_C + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 10) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 7) +... 
+ +--- +name: VLDA_VLDB_different_bank_AC_D +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AC_D + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 10) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 8) +... + +--- +name: VLDA_VLDB_different_bank_AD_A +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AD_A + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 11) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 5) +... + +--- +name: VLDA_VLDB_different_bank_AD_B +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AD_B + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 11) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 6) +... 
+ +--- +name: VLDA_VLDB_different_bank_AD_C +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AD_C + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 11) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 7) +... + +--- +name: VLDA_VLDB_different_bank_AD_D +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AD_D + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 11) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 8) +... + +--- +name: VLDA_VLDB_different_bank_BC_A +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_BC_A + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 12) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 5) +... 
+ +--- +name: VLDA_VLDB_different_bank_BC_B +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_BC_B + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 12) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 6) +... + +--- +name: VLDA_VLDB_different_bank_BC_C +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_BC_C + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 12) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 7) +... + +--- +name: VLDA_VLDB_different_bank_BC_D +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_BC_D + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 12) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 8) +... 
+ +--- +name: VLDA_VLDB_different_bank_BD_A +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_BD_A + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 13) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 13) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 5) +... + +--- +name: VLDA_VLDB_different_bank_BD_B +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_BD_B + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 13) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 13) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 6) +... + +--- +name: VLDA_VLDB_different_bank_BD_C +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_BD_C + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 13) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 13) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 7) +... 
+ +--- +name: VLDA_VLDB_different_bank_BD_D +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_BD_D + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 13) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 13) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 8) +... + +--- +name: VLDA_VLDB_different_bank_CD_A +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_CD_A + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 14) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 14) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 5) +... + +--- +name: VLDA_VLDB_different_bank_CD_B +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_CD_B + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 14) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 14) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 6) +... 
+ +--- +name: VLDA_VLDB_different_bank_CD_C +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_CD_C + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 14) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 14) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 7) +... + +--- +name: VLDA_VLDB_different_bank_CD_D +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_CD_D + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 14) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 14) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 8) +... + + +--- +name: VLDA_VLDB_different_bank_AB_AC +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AB_AC + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 9) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 10) +... 
+ +--- +name: VLDA_VLDB_different_bank_AB_AD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AB_AD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 9) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 11) +... + +--- +name: VLDA_VLDB_different_bank_AB_BC +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AB_BC + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 9) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 12) +... + + +--- +name: VLDA_VLDB_different_bank_AB_BD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AB_BD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 13) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 9) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 13) +... 
+ +--- +name: VLDA_VLDB_different_bank_AB_CD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AB_CD + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 9) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 14) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 9) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 14) +... + +--- +name: VLDA_VLDB_different_bank_AC_AD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AC_AD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 10) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 11) +... + +--- +name: VLDA_VLDB_different_bank_AC_BC +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AC_BC + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 10) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 12) +... 
+ +--- +name: VLDA_VLDB_different_bank_AC_BD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AC_BD + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 13) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 10) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 13) +... + +--- +name: VLDA_VLDB_different_bank_AC_CD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AC_CD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 10) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 14) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 10) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 14) +... + +--- +name: VLDA_VLDB_different_bank_AD_BC +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AD_BC + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit killed $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 11) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 12) +... 
+ +--- +name: VLDA_VLDB_different_bank_AD_BD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AD_BD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 13) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 11) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 13) +... + +--- +name: VLDA_VLDB_different_bank_AD_CD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_AD_CD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 11) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 14) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 11) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 14) +... + +--- +name: VLDA_VLDB_different_bank_BC_BD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_BC_BD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 13) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 12) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 13) +... 
+ +--- +name: VLDA_VLDB_different_bank_BC_CD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_BC_CD + ; CHECK: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 12) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 14) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 12) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 14) +... + + + +--- +name: VLDA_VLDB_different_bank_ABCD +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_different_bank_ABCD + ; CHECK: BUNDLE implicit-def $wl1, implicit-def $wl2, implicit $p0 { + ; CHECK-NEXT: $wl1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 5) + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 6) + ; CHECK-NEXT: } + ; CHECK-NEXT: BUNDLE implicit-def $wh1, implicit-def $wh2, implicit killed $p0 { + ; CHECK-NEXT: $wh1 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 :: (load (<8 x s32>), addrspace 7) + ; CHECK-NEXT: $wh2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 :: (load (<8 x s32>), addrspace 8) + ; CHECK-NEXT: } + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 5) + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 6) + $wh1 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 7) + $wh2 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<8 x s32>), addrspace 8) +... 
diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/vld.mir b/llvm/test/CodeGen/AIE/aie2/schedule/vld.mir index 8b92b5ea84e7..f018433a8c00 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/vld.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/vld.mir @@ -675,3 +675,327 @@ body: | $dc1 = MOV_mv_scl $dc0 $x0 = VADD_8 $x0, $x1 ... + +--- +name: multi_slot_latency_vlda_mod +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: multi_slot_latency_vlda_mod + ; CHECK: $wl0 = VLDB_dmw_ldb_ag_idx killed $p0, killed $dj0 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $x0 = VADD_8 killed $x0, killed $x1 + ; CHECK-NEXT: NOP + $wl0 = VLD_idx_pseudo $p0, $dj0 + $x0 = VADD_8 $x0, $x1 +... + +--- +name: multi_slot_latency_vlda_w +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: multi_slot_latency_vlda_w + ; CHECK: $wl0 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $x0 = VADD_8 killed $x0, killed $x1 + ; CHECK-NEXT: NOP + $wl0 = VLD_idx_imm_3x32_pseudo $p0, 0 + $x0 = VADD_8 $x0, $x1 +... + +--- +name: multi_slot_latency_vlda_post_inc_w +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: multi_slot_latency_vlda_post_inc_w + ; CHECK: $wl0, $p0 = VLDB_dmw_ldb_ag_pstm_nrm_imm killed $p0, 0 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $x0 = VADD_8 killed $x0, killed $x1 + ; CHECK-NEXT: NOP + $wl0, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 0 + $x0 = VADD_8 $x0, $x1 +... 
+ +--- +name: multi_slot_latency_vlda_2d_w +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: multi_slot_latency_vlda_2d_w + ; CHECK: $wl0, $p0, $dc0 = VLDB_2D killed $p0, killed $d0 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $x0 = VADD_8 killed $x0, killed $x1 + ; CHECK-NEXT: NOP + $wl0, $p0, $dc0 = VLD_2D_pseudo $p0, $d0 + $x0 = VADD_8 $x0, $x1 +... + +--- +name: multi_slot_latency_vldb_mod +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: multi_slot_latency_vldb_mod + ; CHECK: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 + ; CHECK-NEXT: $wl0 = VLDB_dmw_ldb_ag_idx killed $p1, killed $dj0 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $x0 = VADD_8 killed $x0, killed $x1 + ; CHECK-NEXT: NOP + $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 + $wl0 = VLD_idx_pseudo $p1, $dj0 + $x0 = VADD_8 $x0, $x1 +... + +--- +name: multi_slot_latency_vldb_imm +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: multi_slot_latency_vldb_imm + ; CHECK: $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 + ; CHECK-NEXT: $wl0 = VLDB_dmw_ldb_ag_idx_imm killed $p1, 0 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $x0 = VADD_8 killed $x0, killed $x1 + ; CHECK-NEXT: NOP + $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 + $wl0 = VLD_idx_imm_3x32_pseudo $p1, 0 + $x0 = VADD_8 $x0, $x1 +... 
+ +--- +name: multi_slot_latency_vldb_post_inc_mod +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: multi_slot_latency_vldb_post_inc_mod + ; CHECK: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $wl0, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, killed $m0 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $x0 = VADD_8 killed $x0, killed $x1 + ; CHECK-NEXT: NOP + $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 + $wl0, $p0 = VLD_pstm_pseudo $p0, $m0 + $x0 = VADD_8 $x0, $x1 +... + +--- +name: multi_slot_latency_vldb_post_inc_imm +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: multi_slot_latency_vldb_post_inc_imm + ; CHECK: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $wl0, $p0 = VLDB_dmw_ldb_ag_pstm_nrm_imm killed $p0, 0 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $x0 = VADD_8 killed $x0, killed $x1 + ; CHECK-NEXT: NOP + $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 + $wl0, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 0 + $x0 = VADD_8 $x0, $x1 +... + + +--- +name: multi_slot_latency_vldb_2d +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: multi_slot_latency_vldb_2d + ; CHECK: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $wl0, $p0, $dc0 = VLDB_2D killed $p0, killed $d0 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $x0 = VADD_8 killed $x0, killed $x1 + ; CHECK-NEXT: NOP + $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 + $wl0, $p0, $dc0 = VLD_2D_pseudo $p0, $d0 + $x0 = VADD_8 $x0, $x1 +... 
+ +--- +name: multi_slot_latency_vlda_mod1 +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: multi_slot_latency_vlda_mod1 + ; CHECK: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 + ; CHECK-NEXT: $wl0 = VLDB_dmw_ldb_ag_idx killed $p1, killed $dj0 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $x0 = VADD_8 killed $x0, killed $x1 + ; CHECK-NEXT: NOP + $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 + $wl0 = VLD_idx_pseudo $p1, $dj0 + $x0 = VADD_8 $x0, $x1 +... + +--- +name: multi_slot_latency_vlda_imm +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: multi_slot_latency_vlda_imm + ; CHECK: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 + ; CHECK-NEXT: $wl0 = VLDB_dmw_ldb_ag_idx_imm killed $p1, 0 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $x0 = VADD_8 killed $x0, killed $x1 + ; CHECK-NEXT: NOP + $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 + $wl0 = VLD_idx_imm_3x32_pseudo $p1, 0 + $x0 = VADD_8 $x0, $x1 +... + +--- +name: multi_slot_latency_vlda_post_inc_mod +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: multi_slot_latency_vlda_post_inc_mod + ; CHECK: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $wl0, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, killed $m0 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $x0 = VADD_8 killed $x0, killed $x1 + ; CHECK-NEXT: NOP + $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 + $wl0, $p0 = VLD_pstm_pseudo $p0, $m0 + $x0 = VADD_8 $x0, $x1 +... 
+ +--- +name: multi_slot_latency_vlda_post_inc_imm +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: multi_slot_latency_vlda_post_inc_imm + ; CHECK: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $wl0, $p0 = VLDB_dmw_ldb_ag_pstm_nrm_imm killed $p0, 0 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $x0 = VADD_8 killed $x0, killed $x1 + ; CHECK-NEXT: NOP + $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 + $wl0, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 0 + $x0 = VADD_8 $x0, $x1 +... + + +--- +name: multi_slot_latency_vlda_2d +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: multi_slot_latency_vlda_2d + ; CHECK: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $wl0, $p0, $dc0 = VLDB_2D killed $p0, killed $d0 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $x0 = VADD_8 killed $x0, killed $x1 + ; CHECK-NEXT: NOP + $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 + $wl0, $p0, $dc0 = VLD_2D_pseudo $p0, $d0 + $x0 = VADD_8 $x0, $x1 +... + +--- +name: latency_vlda_3d_pseudo +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: latency_vlda_3d_pseudo + ; CHECK: $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $wl0, $p0, $dc0, $dc4 = VLDB_3D killed $p0, killed $d0_3d + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl2 = VLDA_dmw_lda_w_ag_idx_imm killed $p0, 0 + $wl0, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d +... 
+ +--- +name: latency_vldb_3d_pseudo +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: latency_vldb_3d_pseudo + ; CHECK: $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $wl0, $p0, $dc0, $dc4 = VLDB_3D killed $p0, killed $d0_3d + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 + $wl0, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/vlda_vldb.mir b/llvm/test/CodeGen/AIE/aie2/schedule/vlda_vldb.mir index 900a762b23dd..742ba3da9bc3 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/vlda_vldb.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/vlda_vldb.mir @@ -89,3 +89,22 @@ body: | $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 $wl2 = VLDA_dmw_lda_w_ag_idx_imm $p0, 0 ... + +--- +name: VLDA_VLDB_pseudo +alignment: 16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: VLDA_VLDB_pseudo + ; CHECK: VST_dmw_sts_w_ag_idx_imm killed $wl0, $p0, 0 + ; CHECK-NEXT: $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 0 + ; CHECK-NEXT: $wl2 = VLDB_dmw_ldb_ag_idx_imm killed $p0, 0 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + VST_dmw_sts_w_ag_idx_imm $wl0, $p0, 0 + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 0 + $wl2 = VLD_idx_imm_3x32_pseudo $p0, 0 diff --git a/llvm/test/CodeGen/AIE/aie2/test-alignas.ll b/llvm/test/CodeGen/AIE/aie2/test-alignas.ll index 7973209a620f..884b3a3e6402 100644 --- a/llvm/test/CodeGen/AIE/aie2/test-alignas.ll +++ b/llvm/test/CodeGen/AIE/aie2/test-alignas.ll @@ -33,7 +33,7 @@ define <16 x i16> @_Z12test_alignasv() { ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda wh0, [p0, #0] +; CHECK-NEXT: vldb wh0, [p0, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: lda lr, [sp, #-2048] // 4-byte Folded Reload ; CHECK-NEXT: nop diff --git a/llvm/test/CodeGen/AIE/aie2/verifier/tied-physical-regs-match-vld-2d.mir 
b/llvm/test/CodeGen/AIE/aie2/verifier/tied-physical-regs-match-vld-2d.mir index 4a79634ac788..2a8af88edc6d 100644 --- a/llvm/test/CodeGen/AIE/aie2/verifier/tied-physical-regs-match-vld-2d.mir +++ b/llvm/test/CodeGen/AIE/aie2/verifier/tied-physical-regs-match-vld-2d.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 # # This file is licensed under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. @@ -25,6 +26,7 @@ body: | $cm5, $p3, $dc2 = VLDA_2D_UPS_S64_S16 $s2, $p3, $d2, implicit-def $srups_of, implicit $crsat $cm7, $p5, $dc0 = VLDA_2D_UPS_S64_D16 $s3, $p5, $d0, implicit-def $srups_of, implicit $crsat, implicit $crupssign $wl0, $p0, $dc0 = VLDB_2D $p0, $d0 + $wl0, $p0, $dc0 = VLD_2D_pseudo $p0, $d0 $bml0, $p0, $dc0 = VLDA_2D_CONV_FP32_BF16 $p0, $d0 $x0, $p0, $dc4 = VLDB_2D_UNPACK_S8_S4 $p0, $d4 $x2, $p2, $dc3 = VLDB_2D_UNPACK_S16_S8 $p2, $d3 @@ -51,6 +53,7 @@ body: | $cm5, %100:ep, %101:edc = VLDA_2D_UPS_S64_S16 $s2, %0, %10, implicit-def $srups_of, implicit $crsat $cm7, %110:ep, %111:edc = VLDA_2D_UPS_S64_D16 $s3, %0, %10, implicit-def $srups_of, implicit $crsat, implicit $crupssign $wl0, %120:ep, %121:edc = VLDB_2D %0, %10 + $wl0, %170:ep, %171:edc = VLD_2D_pseudo %0, %10 $bml0, %130:ep, %131:edc = VLDA_2D_CONV_FP32_BF16 %0, %10 $x0, %140:ep, %141:edc = VLDB_2D_UNPACK_S8_S4 %0, %10 $x2, %150:ep, %151:edc = VLDB_2D_UNPACK_S16_S8 %0, %10 @@ -64,7 +67,7 @@ alignment: 16 body: | bb.0 (align 16): ; CHECK-NOT: Bad machine code - ; CHECK-COUNT-16: Bad machine code: Tied physical registers must match + ; CHECK-COUNT-18: Bad machine code: Tied physical registers must match $wl0, $p0, $dc1 = VLDA_2D_dmw_lda_w $p0, $d0 $amll0, $p0, $dc2 = VLDA_2D_dmw_lda_am $p0, $d0 $bmh0, $p0, $dc3 = VLDA_2D_UPS_S32_D16 $s0, $p0, $d4, implicit-def $srups_of, implicit $crsat, implicit $crupssign @@ -76,6 +79,7 @@ body: | $cm5, $p3, $dc5 = VLDA_2D_UPS_S64_S16 $s2, 
$p3, $d4, implicit-def $srups_of, implicit $crsat $cm7, $p5, $dc4 = VLDA_2D_UPS_S64_D16 $s3, $p5, $d3, implicit-def $srups_of, implicit $crsat, implicit $crupssign $wl0, $p0, $dc3 = VLDB_2D $p0, $d0 + $wl0, $p0, $dc1 = VLD_2D_pseudo $p0, $d0 $bml0, $p0, $dc2 = VLDA_2D_CONV_FP32_BF16 $p0, $d0 $x0, $p0, $dc7 = VLDB_2D_UNPACK_S8_S4 $p0, $d6 $x2, $p2, $dc6 = VLDB_2D_UNPACK_S16_S8 $p2, $d5 @@ -102,4 +106,7 @@ body: | $x2, $p2, renamable $dc3 = VLDB_2D_UNPACK_S16_S8 $p2, renamable $d3 $x5, $p3, renamable $dc2 = VLDB_2D_UNPACK_D8_D4 $p3, renamable $d2, implicit $crunpacksign $x7, $p5, renamable $dc0 = VLDB_2D_UNPACK_D16_D8 $p5, renamable $d0, implicit $crunpacksign + + $wl0, $p0, renamable $dc2 = VLD_2D_pseudo $p0, $d1 + $wl0, $p0, $dc1 = VLD_2D_pseudo $p0, $d1 ... diff --git a/llvm/test/CodeGen/AIE/aie2/verifier/tied-physical-regs-match-vld-3d.mir b/llvm/test/CodeGen/AIE/aie2/verifier/tied-physical-regs-match-vld-3d.mir index 17b711f9290e..a1abf722d274 100644 --- a/llvm/test/CodeGen/AIE/aie2/verifier/tied-physical-regs-match-vld-3d.mir +++ b/llvm/test/CodeGen/AIE/aie2/verifier/tied-physical-regs-match-vld-3d.mir @@ -14,6 +14,7 @@ name: tied_ok alignment: 16 body: | bb.0 (align 16): + $wl0, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d $wl0, $p0, $dc0, $dc4 = VLDA_3D_dmw_lda_w $p0, $d0_3d $amll0, $p0, $dc0, $dc4 = VLDA_3D_dmw_lda_am $p0, $d0_3d $bmh0, $p0, $dc0, $dc4 = VLDA_3D_UPS_S32_D16 $s0, $p0, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign @@ -40,6 +41,7 @@ body: | bb.0 (align 16): %0:ep = COPY $p0 %10:eds = COPY $d0_3d + $wl0, %20:ep, %21:edc, %22:edc = VLD_3D_pseudo %0, %10 $wl0, %20:ep, %21:edc, %22:edc = VLDA_3D_dmw_lda_w %0, %10 $amll0, %30:ep, %31:edc, %32:edc = VLDA_3D_dmw_lda_am %0, %10 $bmh0, %40:ep, %41:edc, %42:edc = VLDA_3D_UPS_S32_D16 $s0, %0, %10, implicit-def $srups_of, implicit $crsat, implicit $crupssign @@ -64,7 +66,8 @@ alignment: 16 body: | bb.0 (align 16): ; CHECK-NOT: Bad machine code - ; CHECK-COUNT-16: Bad machine code: 
Tied physical registers must match + ; CHECK-COUNT-17: Bad machine code: Tied physical registers must match + $wl0, $p0, $dc1, $dc3 = VLD_3D_pseudo $p0, $d1_3d $wl0, $p0, $dc1, $dc3 = VLDA_3D_dmw_lda_w $p0, $d1_3d $wl0, $p0, $dc1, $dc3 = VLDA_3D_dmw_lda_am $p0, $d1_3d $bmh0, $p0, $dc0, $dc2 = VLDA_3D_UPS_S32_D16 $s0, $p0, $d3_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign diff --git a/llvm/test/CodeGen/AIE/aie2/vst_srs.ll b/llvm/test/CodeGen/AIE/aie2/vst_srs.ll index 90f64779fd2b..a91417a6e0fb 100644 --- a/llvm/test/CodeGen/AIE/aie2/vst_srs.ll +++ b/llvm/test/CodeGen/AIE/aie2/vst_srs.ll @@ -384,7 +384,7 @@ define dso_local noundef <16 x i16> @test_postincrement(ptr %array, <8 x i64> no ; CHECK-NEXT: vst.srs.d16.s32 bml0, s0, [p0], #32 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda wl0, [p0, #0] +; CHECK-NEXT: vldb wl0, [p0, #0] ; CHECK-NEXT: ret lr ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 From 9feca8509d17c4ac26604b501e3ff61d9f140dc4 Mon Sep 17 00:00:00 2001 From: Krishnam Tibrewala Date: Thu, 15 Aug 2024 11:56:33 -0700 Subject: [PATCH 26/31] [AIE2] NFC : Refactor checkImmediateRange*(...) 
function --- .../Target/AIE/AIE2InstructionSelector.cpp | 177 +++++++++--------- 1 file changed, 89 insertions(+), 88 deletions(-) diff --git a/llvm/lib/Target/AIE/AIE2InstructionSelector.cpp b/llvm/lib/Target/AIE/AIE2InstructionSelector.cpp index 62616639c977..715014139589 100644 --- a/llvm/lib/Target/AIE/AIE2InstructionSelector.cpp +++ b/llvm/lib/Target/AIE/AIE2InstructionSelector.cpp @@ -1467,9 +1467,10 @@ unsigned getLoadStoreSize(const MachineInstr &MI) { return (*MI.memoperands_begin())->getSizeInBits().getValue(); } -template +template bool checkImmediateRange(std::optional Immediate) { - if (Immediate && isInt(Immediate->getSExtValue()) && + unsigned MaxPow2 = NumEncodingBits + llvm::Log2_64(Step); + if (Immediate && isIntN(MaxPow2, Immediate->getSExtValue()) && Immediate->getSExtValue() % Step == 0) { LLVM_DEBUG(dbgs() << "Immediate " << Immediate << " is valid for MaxPow2 " << MaxPow2 << " and Step " << Step << ".\n"); @@ -1478,10 +1479,10 @@ bool checkImmediateRange(std::optional Immediate) { return false; } -template +template bool checkImmediateRangeSplitting(std::optional Immediate) { - return Immediate && checkImmediateRange(Immediate) && - checkImmediateRange(*Immediate + SplitOffset); + return Immediate && checkImmediateRange(Immediate) && + checkImmediateRange(*Immediate + SplitOffset); } std::optional getCombinedOpcodeUNPACKLoad( @@ -1739,14 +1740,14 @@ getCombinedOpcodeSRSUPS(const MachineInstr &MemOp, const MachineInstr &CombOp, switch (cast(CombOp).getIntrinsicID()) { case Intrinsic::aie2_I512_v32_acc32_srs: FitsImmediateRange = - checkImmediateRangeSplitting<8, 32, 32>(Immediate); + checkImmediateRangeSplitting<3, 32, 32>(Immediate); return LoadStoreOpcodes{ /*ISelOpcode=*/AIE2::VST_SRS_S16_S32_ag_idx_imm, FitsImmediateRange, /*OffsetOpcode=*/AIE2::VST_SRS_S16_S32_ag_idx_imm}; case Intrinsic::aie2_I512_v16_acc64_srs: FitsImmediateRange = - checkImmediateRangeSplitting<8, 32, 32>(Immediate); + checkImmediateRangeSplitting<3, 32, 
32>(Immediate); return LoadStoreOpcodes{ /*ISelOpcode=*/AIE2::VST_SRS_S32_S64_ag_idx_imm, FitsImmediateRange, @@ -1756,25 +1757,25 @@ getCombinedOpcodeSRSUPS(const MachineInstr &MemOp, const MachineInstr &CombOp, if (getLoadStoreSize(MemOp) == 256) { switch (cast(CombOp).getIntrinsicID()) { case Intrinsic::aie2_I256_v16_acc32_srs: - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VST_SRS_S16_S32_ag_idx_imm : AIE2::VST_SRS_S16_S32_ag_idx; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_I256_v16_acc64_srs: - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VST_SRS_S16_S64_ag_idx_imm : AIE2::VST_SRS_S16_S64_ag_idx; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_I256_v32_acc32_srs: - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VST_SRS_S8_S32_ag_idx_imm : AIE2::VST_SRS_S8_S32_ag_idx; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_I256_v8_acc64_srs: - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? 
AIE2::VST_SRS_S32_S64_ag_idx_imm : AIE2::VST_SRS_S32_S64_ag_idx; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, @@ -1786,7 +1787,7 @@ getCombinedOpcodeSRSUPS(const MachineInstr &MemOp, const MachineInstr &CombOp, if (getLoadStoreSize(MemOp) == 512) { switch (cast(CombOp).getIntrinsicID()) { case Intrinsic::aie2_I512_v32_acc32_srs: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VST_SRS_S16_S32_ag_pstm_nrm_imm : AIE2::VST_SRS_S16_S32_ag_pstm_nrm; @@ -1794,7 +1795,7 @@ getCombinedOpcodeSRSUPS(const MachineInstr &MemOp, const MachineInstr &CombOp, ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/AIE2::VST_SRS_S16_S32_ag_idx_imm}; case Intrinsic::aie2_I512_v16_acc64_srs: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VST_SRS_S32_S64_ag_pstm_nrm_imm : AIE2::VST_SRS_S32_S64_ag_pstm_nrm; @@ -1806,27 +1807,27 @@ getCombinedOpcodeSRSUPS(const MachineInstr &MemOp, const MachineInstr &CombOp, if (getLoadStoreSize(MemOp) == 256) { switch (cast(CombOp).getIntrinsicID()) { case Intrinsic::aie2_I256_v16_acc32_srs: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VST_SRS_S16_S32_ag_pstm_nrm_imm : AIE2::VST_SRS_S16_S32_ag_pstm_nrm; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_I256_v16_acc64_srs: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? 
AIE2::VST_SRS_S16_S64_ag_pstm_nrm_imm : AIE2::VST_SRS_S16_S64_ag_pstm_nrm; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_I256_v32_acc32_srs: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VST_SRS_S8_S32_ag_pstm_nrm_imm : AIE2::VST_SRS_S8_S32_ag_pstm_nrm; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_I256_v8_acc64_srs: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VST_SRS_S32_S64_ag_pstm_nrm_imm : AIE2::VST_SRS_S32_S64_ag_pstm_nrm; @@ -1944,14 +1945,14 @@ getCombinedOpcodeSRSUPS(const MachineInstr &MemOp, const MachineInstr &CombOp, switch (cast(CombOp).getIntrinsicID()) { case Intrinsic::aie2_acc64_v16_I512_ups: FitsImmediateRange = - checkImmediateRangeSplitting<8, 32, 32>(Immediate); + checkImmediateRangeSplitting<3, 32, 32>(Immediate); return LoadStoreOpcodes{ /*ISelOpcode=*/AIE2::VLDA_UPS_S64_S32_ag_idx_imm, FitsImmediateRange, /*OffsetOpcode=*/AIE2::VLDA_UPS_S64_S32_ag_idx_imm}; case Intrinsic::aie2_acc32_v32_I512_ups: FitsImmediateRange = - checkImmediateRangeSplitting<8, 32, 32>(Immediate); + checkImmediateRangeSplitting<3, 32, 32>(Immediate); return LoadStoreOpcodes{ /*ISelOpcode=*/AIE2::VLDA_UPS_S32_S16_ag_idx_imm, FitsImmediateRange, @@ -1961,25 +1962,25 @@ getCombinedOpcodeSRSUPS(const MachineInstr &MemOp, const MachineInstr &CombOp, if (getLoadStoreSize(MemOp) == 256) { switch (cast(CombOp).getIntrinsicID()) { case Intrinsic::aie2_acc32_v16_I256_ups: - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? 
AIE2::VLDA_UPS_S32_S16_ag_idx_imm : AIE2::VLDA_UPS_S32_S16_ag_idx; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_acc64_v16_I256_ups: - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VLDA_UPS_S64_S16_ag_idx_imm : AIE2::VLDA_UPS_S64_S16_ag_idx; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_acc32_v32_I256_ups: - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VLDA_UPS_S32_S8_ag_idx_imm : AIE2::VLDA_UPS_S32_S8_ag_idx; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_acc64_v8_I256_ups: - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VLDA_UPS_S64_S32_ag_idx_imm : AIE2::VLDA_UPS_S64_S32_ag_idx; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, @@ -1991,7 +1992,7 @@ getCombinedOpcodeSRSUPS(const MachineInstr &MemOp, const MachineInstr &CombOp, if (getLoadStoreSize(MemOp) == 512) { switch (cast(CombOp).getIntrinsicID()) { case Intrinsic::aie2_acc64_v16_I512_ups: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VLDA_UPS_S64_S32_ag_pstm_nrm_imm : AIE2::VLDA_UPS_S64_S32_ag_pstm_nrm; @@ -1999,7 +2000,7 @@ getCombinedOpcodeSRSUPS(const MachineInstr &MemOp, const MachineInstr &CombOp, ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/AIE2::VLDA_UPS_S64_S32_ag_idx_imm}; case Intrinsic::aie2_acc32_v32_I512_ups: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? 
AIE2::VLDA_UPS_S32_S16_ag_pstm_nrm_imm : AIE2::VLDA_UPS_S32_S16_ag_pstm_nrm; @@ -2011,28 +2012,28 @@ getCombinedOpcodeSRSUPS(const MachineInstr &MemOp, const MachineInstr &CombOp, if (getLoadStoreSize(MemOp) == 256) { switch (cast(CombOp).getIntrinsicID()) { case Intrinsic::aie2_acc32_v16_I256_ups: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VLDA_UPS_S32_S16_ag_pstm_nrm_imm : AIE2::VLDA_UPS_S32_S16_ag_pstm_nrm; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_acc64_v16_I256_ups: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VLDA_UPS_S64_S16_ag_pstm_nrm_imm : AIE2::VLDA_UPS_S64_S16_ag_pstm_nrm; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_acc32_v32_I256_ups: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VLDA_UPS_S32_S8_ag_pstm_nrm_imm : AIE2::VLDA_UPS_S32_S8_ag_pstm_nrm; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_acc64_v8_I256_ups: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? 
AIE2::VLDA_UPS_S64_S32_ag_pstm_nrm_imm : AIE2::VLDA_UPS_S64_S32_ag_pstm_nrm; @@ -2154,14 +2155,14 @@ getCombinedOpcodeSRSUPS(const MachineInstr &MemOp, const MachineInstr &CombOp, switch (cast(CombOp).getIntrinsicID()) { case Intrinsic::aie2_I512_v32_acc32_srs: FitsImmediateRange = - checkImmediateRangeSplitting<8, 32, 32>(Immediate); + checkImmediateRangeSplitting<3, 32, 32>(Immediate); return LoadStoreOpcodes{ /*ISelOpcode=*/AIE2::VST_SRS_D16_S32_ag_idx_imm, FitsImmediateRange, /*OffsetOpcode=*/AIE2::VST_SRS_D16_S32_ag_idx_imm}; case Intrinsic::aie2_I512_v16_acc64_srs: FitsImmediateRange = - checkImmediateRangeSplitting<8, 32, 32>(Immediate); + checkImmediateRangeSplitting<3, 32, 32>(Immediate); return LoadStoreOpcodes{ /*ISelOpcode=*/AIE2::VST_SRS_D32_S64_ag_idx_imm, FitsImmediateRange, @@ -2171,25 +2172,25 @@ getCombinedOpcodeSRSUPS(const MachineInstr &MemOp, const MachineInstr &CombOp, if (getLoadStoreSize(MemOp) == 256) { switch (cast(CombOp).getIntrinsicID()) { case Intrinsic::aie2_I256_v16_acc32_srs: - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VST_SRS_D16_S32_ag_idx_imm : AIE2::VST_SRS_D16_S32_ag_idx; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_I256_v16_acc64_srs: - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VST_SRS_D16_S64_ag_idx_imm : AIE2::VST_SRS_D16_S64_ag_idx; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_I256_v32_acc32_srs: - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? 
AIE2::VST_SRS_D8_S32_ag_idx_imm : AIE2::VST_SRS_D8_S32_ag_idx; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_I256_v8_acc64_srs: - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VST_SRS_D32_S64_ag_idx_imm : AIE2::VST_SRS_D32_S64_ag_idx; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, @@ -2201,7 +2202,7 @@ getCombinedOpcodeSRSUPS(const MachineInstr &MemOp, const MachineInstr &CombOp, if (getLoadStoreSize(MemOp) == 512) { switch (cast(CombOp).getIntrinsicID()) { case Intrinsic::aie2_I512_v32_acc32_srs: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VST_SRS_D16_S32_ag_pstm_nrm_imm : AIE2::VST_SRS_D16_S32_ag_pstm_nrm; @@ -2209,7 +2210,7 @@ getCombinedOpcodeSRSUPS(const MachineInstr &MemOp, const MachineInstr &CombOp, ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/AIE2::VST_SRS_D16_S32_ag_idx_imm}; case Intrinsic::aie2_I512_v16_acc64_srs: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VST_SRS_D32_S64_ag_pstm_nrm_imm : AIE2::VST_SRS_D32_S64_ag_pstm_nrm; @@ -2221,27 +2222,27 @@ getCombinedOpcodeSRSUPS(const MachineInstr &MemOp, const MachineInstr &CombOp, if (getLoadStoreSize(MemOp) == 256) { switch (cast(CombOp).getIntrinsicID()) { case Intrinsic::aie2_I256_v16_acc32_srs: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? 
AIE2::VST_SRS_D16_S32_ag_pstm_nrm_imm : AIE2::VST_SRS_D16_S32_ag_pstm_nrm; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_I256_v16_acc64_srs: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VST_SRS_D16_S64_ag_pstm_nrm_imm : AIE2::VST_SRS_D16_S64_ag_pstm_nrm; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_I256_v32_acc32_srs: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VST_SRS_D8_S32_ag_pstm_nrm_imm : AIE2::VST_SRS_D8_S32_ag_pstm_nrm; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_I256_v8_acc64_srs: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? 
AIE2::VST_SRS_D32_S64_ag_pstm_nrm_imm : AIE2::VST_SRS_D32_S64_ag_pstm_nrm; @@ -2359,14 +2360,14 @@ getCombinedOpcodeSRSUPS(const MachineInstr &MemOp, const MachineInstr &CombOp, switch (cast(CombOp).getIntrinsicID()) { case Intrinsic::aie2_acc64_v16_I512_ups: FitsImmediateRange = - checkImmediateRangeSplitting<8, 32, 32>(Immediate); + checkImmediateRangeSplitting<3, 32, 32>(Immediate); return LoadStoreOpcodes{ /*ISelOpcode=*/AIE2::VLDA_UPS_S64_D32_ag_idx_imm, FitsImmediateRange, /*OffsetOpcode=*/AIE2::VLDA_UPS_S64_D32_ag_idx_imm}; case Intrinsic::aie2_acc32_v32_I512_ups: FitsImmediateRange = - checkImmediateRangeSplitting<8, 32, 32>(Immediate); + checkImmediateRangeSplitting<3, 32, 32>(Immediate); return LoadStoreOpcodes{ /*ISelOpcode=*/AIE2::VLDA_UPS_S32_D16_ag_idx_imm, FitsImmediateRange, @@ -2376,25 +2377,25 @@ getCombinedOpcodeSRSUPS(const MachineInstr &MemOp, const MachineInstr &CombOp, if (getLoadStoreSize(MemOp) == 256) { switch (cast(CombOp).getIntrinsicID()) { case Intrinsic::aie2_acc32_v16_I256_ups: - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VLDA_UPS_S32_D16_ag_idx_imm : AIE2::VLDA_UPS_S32_D16_ag_idx; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_acc64_v16_I256_ups: - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VLDA_UPS_S64_D16_ag_idx_imm : AIE2::VLDA_UPS_S64_D16_ag_idx; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_acc32_v32_I256_ups: - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? 
AIE2::VLDA_UPS_S32_D8_ag_idx_imm : AIE2::VLDA_UPS_S32_D8_ag_idx; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_acc64_v8_I256_ups: - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VLDA_UPS_S64_D32_ag_idx_imm : AIE2::VLDA_UPS_S64_D32_ag_idx; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, @@ -2406,7 +2407,7 @@ getCombinedOpcodeSRSUPS(const MachineInstr &MemOp, const MachineInstr &CombOp, if (getLoadStoreSize(MemOp) == 512) { switch (cast(CombOp).getIntrinsicID()) { case Intrinsic::aie2_acc64_v16_I512_ups: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VLDA_UPS_S64_D32_ag_pstm_nrm_imm : AIE2::VLDA_UPS_S64_D32_ag_pstm_nrm; @@ -2414,7 +2415,7 @@ getCombinedOpcodeSRSUPS(const MachineInstr &MemOp, const MachineInstr &CombOp, ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/AIE2::VLDA_UPS_S64_D32_ag_idx_imm}; case Intrinsic::aie2_acc32_v32_I512_ups: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VLDA_UPS_S32_D16_ag_pstm_nrm_imm : AIE2::VLDA_UPS_S32_D16_ag_pstm_nrm; @@ -2426,28 +2427,28 @@ getCombinedOpcodeSRSUPS(const MachineInstr &MemOp, const MachineInstr &CombOp, if (getLoadStoreSize(MemOp) == 256) { switch (cast(CombOp).getIntrinsicID()) { case Intrinsic::aie2_acc32_v16_I256_ups: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? 
AIE2::VLDA_UPS_S32_D16_ag_pstm_nrm_imm : AIE2::VLDA_UPS_S32_D16_ag_pstm_nrm; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_acc64_v16_I256_ups: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VLDA_UPS_S64_D16_ag_pstm_nrm_imm : AIE2::VLDA_UPS_S64_D16_ag_pstm_nrm; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_acc32_v32_I256_ups: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VLDA_UPS_S32_D8_ag_pstm_nrm_imm : AIE2::VLDA_UPS_S32_D8_ag_pstm_nrm; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case Intrinsic::aie2_acc64_v8_I256_ups: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? 
AIE2::VLDA_UPS_S64_D32_ag_pstm_nrm_imm : AIE2::VLDA_UPS_S64_D32_ag_pstm_nrm; @@ -3324,13 +3325,13 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( if (getLoadStoreSize(I) == 512) { unsigned RBID = deriveRegBankID(I.getOperand(0).getReg(), MRI, RBI); if (RBID == AIE2::AccRegBankID) { - FitsImmediateRange = checkImmediateRangeSplitting<11, 32, 32>(Offset); + FitsImmediateRange = checkImmediateRangeSplitting<6, 32, 32>(Offset); return {/*ISelOpcode=*/AIE2::VST_dmw_sts_am_ag_idx_imm, FitsImmediateRange, /*OffsetOpcode=*/AIE2::VST_dmw_sts_am_ag_idx_imm}; } if (RBID == AIE2::VRegBankID) { - FitsImmediateRange = checkImmediateRangeSplitting<11, 32, 32>(Offset); + FitsImmediateRange = checkImmediateRangeSplitting<6, 32, 32>(Offset); return {/*ISelOpcode=*/AIE2::VST_dmw_sts_w_ag_idx_imm, FitsImmediateRange, /*OffsetOpcode=*/AIE2::VST_dmw_sts_w_ag_idx_imm}; @@ -3340,14 +3341,14 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( if (getLoadStoreSize(I) == 256) { unsigned RBID = deriveRegBankID(I.getOperand(0).getReg(), MRI, RBI); if (RBID == AIE2::AccRegBankID) { - FitsImmediateRange = checkImmediateRange<11, 32>(Offset); + FitsImmediateRange = checkImmediateRange<6, 32>(Offset); ISelOpcode = FitsImmediateRange ? AIE2::VST_dmw_sts_am_ag_idx_imm : AIE2::VST_dmw_sts_am_ag_idx; return {ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/AIE2::VST_dmw_sts_am_ag_idx_imm}; } if (RBID == AIE2::VRegBankID) { - FitsImmediateRange = checkImmediateRange<11, 32>(Offset); + FitsImmediateRange = checkImmediateRange<6, 32>(Offset); ISelOpcode = FitsImmediateRange ? 
AIE2::VST_dmw_sts_w_ag_idx_imm : AIE2::VST_dmw_sts_w_ag_idx; return {ISelOpcode, FitsImmediateRange, @@ -3356,13 +3357,13 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( llvm_unreachable("Vector type not in AccRegBank nor VRegBank"); } if (getLoadStoreSize(I) == 128) { - FitsImmediateRange = checkImmediateRange<10, 16>(Offset); + FitsImmediateRange = checkImmediateRange<6, 16>(Offset); ISelOpcode = FitsImmediateRange ? AIE2::ST_dmv_sts_q_ag_idx_imm : AIE2::ST_dmv_sts_q_ag_idx; return {ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; } if (getLoadStoreSize(I) == 20 || getLoadStoreSize(I) == 32) { - FitsImmediateRange = checkImmediateRange<8, 4>(Offset); + FitsImmediateRange = checkImmediateRange<6, 4>(Offset); ISelOpcode = FitsImmediateRange ? AIE2::ST_dms_sts_idx_imm : AIE2::ST_dms_sts_idx; return {ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; @@ -3384,7 +3385,7 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( if (getLoadStoreSize(I) == 128) { unsigned RBID = deriveRegBankID(I.getOperand(1).getReg(), MRI, RBI); if (RBID == AIE2::VRegBankID) { - FitsImmediateRange = checkImmediateRange<11, 16>(Offset); + FitsImmediateRange = checkImmediateRange<7, 16>(Offset); ISelOpcode = FitsImmediateRange ? AIE2::ST_dmv_sts_q_ag_pstm_nrm_imm : AIE2::ST_dmv_sts_q_ag_pstm_nrm; return {ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; @@ -3394,14 +3395,14 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( if (getLoadStoreSize(I) == 256 || getLoadStoreSize(I) == 512) { unsigned RBID = deriveRegBankID(I.getOperand(1).getReg(), MRI, RBI); if (RBID == AIE2::AccRegBankID) { - FitsImmediateRange = checkImmediateRange<12, 32>(Offset); + FitsImmediateRange = checkImmediateRange<7, 32>(Offset); ISelOpcode = FitsImmediateRange ? 
AIE2::VST_dmw_sts_am_ag_pstm_nrm_imm : AIE2::VST_dmw_sts_am_ag_pstm_nrm; return {ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/AIE2::VST_dmw_sts_am_ag_idx_imm}; } if (RBID == AIE2::VRegBankID) { - FitsImmediateRange = checkImmediateRange<12, 32>(Offset); + FitsImmediateRange = checkImmediateRange<7, 32>(Offset); ISelOpcode = FitsImmediateRange ? AIE2::VST_dmw_sts_w_ag_pstm_nrm_imm : AIE2::VST_dmw_sts_w_ag_pstm_nrm; return {ISelOpcode, FitsImmediateRange, @@ -3410,7 +3411,7 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( llvm_unreachable("Vector type not in AccRegBank nor VRegBank"); } if (getLoadStoreSize(I) == 20 || getLoadStoreSize(I) == 32) { - FitsImmediateRange = checkImmediateRange<9, 4>(Offset); + FitsImmediateRange = checkImmediateRange<7, 4>(Offset); ISelOpcode = FitsImmediateRange ? AIE2::ST_dms_sts_pstm_nrm_imm : AIE2::ST_dms_sts_pstm_nrm; return {ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; @@ -3551,7 +3552,7 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( if (getLoadStoreSize(I) == 512) { unsigned RBID = deriveRegBankID(I.getOperand(0).getReg(), MRI, RBI); if (RBID == AIE2::AccRegBankID) { - FitsImmediateRange = checkImmediateRangeSplitting<11, 32, 32>(Offset); + FitsImmediateRange = checkImmediateRangeSplitting<6, 32, 32>(Offset); return {/*ISelOpcode=*/AIE2::VLDA_dmw_lda_am_ag_idx_imm, FitsImmediateRange, /*OffsetOpcode=*/AIE2::VLDA_dmw_lda_am_ag_idx_imm}; @@ -3560,23 +3561,23 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( unsigned OffsetOpcode; // First try if the Instruction can be selected as multi-slot offset // load - if (checkImmediateRangeSplitting<8, 32, 32>(Offset)) { + if (checkImmediateRangeSplitting<3, 32, 32>(Offset)) { FitsImmediateRange = true; ISelOpcode = OffsetOpcode = AIE2::VLD_idx_imm_3x32_pseudo; - } else if (checkImmediateRange<8, 32>(Offset)) { + } else if (checkImmediateRange<3, 32>(Offset)) { // When Offset is positive and one of the offset is in range of SlotB 
ISelOpcode = AIE2::VLD_idx_imm_3x32_pseudo; OffsetOpcode = AIE2::VLDA_dmw_lda_w_ag_idx_imm; FitsImmediateRange = true; } else if (Offset.has_value() && (*Offset).isNegative() && - checkImmediateRange<8, 32>((*Offset) + 32)) { + checkImmediateRange<3, 32>((*Offset) + 32)) { // When Offset is negative and one of the offset is in range of SlotB ISelOpcode = AIE2::VLDA_dmw_lda_w_ag_idx_imm; OffsetOpcode = AIE2::VLD_idx_imm_3x32_pseudo; FitsImmediateRange = true; } else { // When Offset & Offset+32 are out of range of SlotB - FitsImmediateRange = checkImmediateRangeSplitting<11, 32, 32>(Offset); + FitsImmediateRange = checkImmediateRangeSplitting<6, 32, 32>(Offset); ISelOpcode = OffsetOpcode = AIE2::VLDA_dmw_lda_w_ag_idx_imm; } return {/*ISelOpcode=*/ISelOpcode, FitsImmediateRange, @@ -3587,7 +3588,7 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( if (getLoadStoreSize(I) == 256) { unsigned RBID = deriveRegBankID(I.getOperand(0).getReg(), MRI, RBI); if (RBID == AIE2::AccRegBankID) { - FitsImmediateRange = checkImmediateRange<11, 32>(Offset); + FitsImmediateRange = checkImmediateRange<6, 32>(Offset); ISelOpcode = FitsImmediateRange ? AIE2::VLDA_dmw_lda_am_ag_idx_imm : AIE2::VLDA_dmw_lda_am_ag_idx; return {ISelOpcode, FitsImmediateRange, @@ -3596,11 +3597,11 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( if (RBID == AIE2::VRegBankID) { // First try if the Instruction can be selected as multi-slot offset // load - if (checkImmediateRange<8, 32>(Offset)) { + if (checkImmediateRange<3, 32>(Offset)) { FitsImmediateRange = true; ISelOpcode = AIE2::VLD_idx_imm_3x32_pseudo; } else { - FitsImmediateRange = checkImmediateRange<11, 32>(Offset); + FitsImmediateRange = checkImmediateRange<6, 32>(Offset); ISelOpcode = FitsImmediateRange ? 
AIE2::VLDA_dmw_lda_w_ag_idx_imm : AIE2::VLD_idx_pseudo; } @@ -3615,13 +3616,13 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( * which instruction to select between the available LDA_dmv_lda_q_ag_idx * which has 128-bit destination operand vs VLDB_128_ag_idx which has * 256-bit destination operand. */ - FitsImmediateRange = checkImmediateRange<10, 16>(Offset); + FitsImmediateRange = checkImmediateRange<6, 16>(Offset); ISelOpcode = FitsImmediateRange ? AIE2::LDA_dmv_lda_q_ag_idx_imm : AIE2::VLDB_128_ag_idx; return {ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; } if (getLoadStoreSize(I) == 20 || getLoadStoreSize(I) == 32) { - FitsImmediateRange = checkImmediateRange<8, 4>(Offset); + FitsImmediateRange = checkImmediateRange<6, 4>(Offset); ISelOpcode = FitsImmediateRange ? AIE2::LDA_dms_lda_idx_imm : AIE2::LDA_dms_lda_idx; return {ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; @@ -3745,7 +3746,7 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( if (getLoadStoreSize(I) == 256 || getLoadStoreSize(I) == 512) { unsigned RBID = deriveRegBankID(I.getOperand(0).getReg(), MRI, RBI); if (RBID == AIE2::AccRegBankID) { - FitsImmediateRange = checkImmediateRange<12, 32>(Offset); + FitsImmediateRange = checkImmediateRange<7, 32>(Offset); ISelOpcode = FitsImmediateRange ? AIE2::VLDA_dmw_lda_am_ag_pstm_nrm_imm : AIE2::VLDA_dmw_lda_am_ag_pstm_nrm; return {ISelOpcode, FitsImmediateRange, @@ -3754,11 +3755,11 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( if (RBID == AIE2::VRegBankID) { // First try if the Instruction can be selected as multi-slot offset // load - if (checkImmediateRange<9, 32>(Offset)) { + if (checkImmediateRange<4, 32>(Offset)) { FitsImmediateRange = true; ISelOpcode = AIE2::VLD_pstm_imm_4x32_pseudo; } else { - FitsImmediateRange = checkImmediateRange<12, 32>(Offset); + FitsImmediateRange = checkImmediateRange<7, 32>(Offset); ISelOpcode = FitsImmediateRange ? 
AIE2::VLDA_dmw_lda_w_ag_pstm_nrm_imm : AIE2::VLD_pstm_pseudo; } @@ -3776,7 +3777,7 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( * between the available LDA_dmv_lda_q_ag_pstm_nrm which has * 128-bit destination operand vs VLDB_dmv_ldb_ag_pstm_nrm which * has 256-bit destination operand. */ - FitsImmediateRange = checkImmediateRange<11, 16>(Offset); + FitsImmediateRange = checkImmediateRange<7, 16>(Offset); ISelOpcode = FitsImmediateRange ? AIE2::LDA_dmv_lda_q_ag_pstm_nrm_imm : AIE2::VLDB_128_ag_pstm_nrm; return {ISelOpcode, FitsImmediateRange, @@ -3785,7 +3786,7 @@ LoadStoreOpcodes AIE2InstructionSelector::getLoadStoreOpcode( llvm_unreachable("Vector type not in VRegBank"); } if (getLoadStoreSize(I) == 20 || getLoadStoreSize(I) == 32) { - FitsImmediateRange = checkImmediateRange<9, 4>(Offset); + FitsImmediateRange = checkImmediateRange<7, 4>(Offset); ISelOpcode = FitsImmediateRange ? AIE2::LDA_dms_lda_pstm_nrm_imm : AIE2::LDA_dms_lda_pstm_nrm; return {ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; @@ -3856,13 +3857,13 @@ getCombinedOpcodePACK(const MachineInstr &MemOp, const MachineInstr &CombOp, break; case AIE2::G_AIE_OFFSET_STORE: if (Is32Lanes) { - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VST_PACK_S8_S16_ag_idx_imm : AIE2::VST_PACK_S8_S16_ag_idx; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; } else { - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? 
AIE2::VST_PACK_S4_S8_ag_idx_imm : AIE2::VST_PACK_S4_S8_ag_idx; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, @@ -3870,7 +3871,7 @@ getCombinedOpcodePACK(const MachineInstr &MemOp, const MachineInstr &CombOp, } break; case AIE2::G_AIE_POSTINC_STORE: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); if (Is32Lanes) { ISelOpcode = FitsImmediateRange ? AIE2::VST_PACK_S8_S16_ag_pstm_nrm_imm : AIE2::VST_PACK_S8_S16_ag_pstm_nrm; @@ -3921,13 +3922,13 @@ getCombinedOpcodePACK(const MachineInstr &MemOp, const MachineInstr &CombOp, break; case AIE2::G_AIE_OFFSET_STORE: if (Is32Lanes) { - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VST_PACK_D8_D16_ag_idx_imm : AIE2::VST_PACK_D8_D16_ag_idx; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; } else { - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VST_PACK_D4_D8_ag_idx_imm : AIE2::VST_PACK_D4_D8_ag_idx; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, @@ -3935,7 +3936,7 @@ getCombinedOpcodePACK(const MachineInstr &MemOp, const MachineInstr &CombOp, } break; case AIE2::G_AIE_POSTINC_STORE: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); if (Is32Lanes) { ISelOpcode = FitsImmediateRange ? 
AIE2::VST_PACK_D8_D16_ag_pstm_nrm_imm : AIE2::VST_PACK_D8_D16_ag_pstm_nrm; @@ -4218,13 +4219,13 @@ getCombinedOpcodeCONV(const MachineInstr &MemOp, const MachineInstr &CombOp, return LoadStoreOpcodes{/*ISelOpcode=*/AIE2::VST_CONV_BF16_FP32_ag_idx_imm, AlwaysFitsImmediateRange, /*OffsetOpcode=*/{}}; case AIE2::G_AIE_OFFSET_STORE: - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VST_CONV_BF16_FP32_ag_idx_imm : AIE2::VST_CONV_BF16_FP32_ag_idx; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, /*OffsetOpcode=*/{}}; case AIE2::G_AIE_POSTINC_STORE: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VST_CONV_BF16_FP32_ag_pstm_nrm_imm : AIE2::VST_CONV_BF16_FP32_ag_pstm_nrm; return LoadStoreOpcodes{ISelOpcode, FitsImmediateRange, @@ -4460,12 +4461,12 @@ static bool getVLDA_CONVOpcode(const MachineInstr &MemOp, FitsImmediateRange = true; return true; case AIE2::G_AIE_OFFSET_LOAD: - FitsImmediateRange = checkImmediateRange<8, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<3, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VLDA_CONV_FP32_BF16_ag_idx_imm : AIE2::VLDA_CONV_FP32_BF16_ag_idx; return true; case AIE2::G_AIE_POSTINC_LOAD: - FitsImmediateRange = checkImmediateRange<9, 32>(Immediate); + FitsImmediateRange = checkImmediateRange<4, 32>(Immediate); ISelOpcode = FitsImmediateRange ? AIE2::VLDA_CONV_FP32_BF16_pstm_nrm_imm : AIE2::VLDA_CONV_FP32_BF16_pstm_nrm; return true; From 61d0600cf7221a2b614c3837936b1670965a3b6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Mon, 12 Aug 2024 14:29:23 +0100 Subject: [PATCH 27/31] [AIEX] Support Final REG_SEQUENCE when combining INSERT-SUBREG chains E.g. 
%5:vec512 = REG_SEQUENCE %1, %subreg.sub_256_lo %6:vec512 = INSERT_SUBREG %5, %2, %subreg.sub_256_hi Combined into: %6:vec512 = REG_SEQUENCE %1, %subreg.sub_256_lo, %2, %subreg.sub_256_hi --- llvm/lib/Target/AIE/AIEPostSelectOptimize.cpp | 14 ++++++++++- .../aie2/GlobalISel/post-select-combine.mir | 25 +++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AIE/AIEPostSelectOptimize.cpp b/llvm/lib/Target/AIE/AIEPostSelectOptimize.cpp index 153124ae7794..306f6aece523 100644 --- a/llvm/lib/Target/AIE/AIEPostSelectOptimize.cpp +++ b/llvm/lib/Target/AIE/AIEPostSelectOptimize.cpp @@ -214,11 +214,23 @@ std::map collectSubregsChain(const MachineInstr &MI) { // E.g. %5:vec512 = INSERT_SUBREG %0, %1, %subreg.sub_256_hi // %6:vec512 = INSERT_SUBREG %5, %2, %subreg.sub_256_lo std::function Impl = [&](const MachineInstr &MI) { + if (MI.getOpcode() == TargetOpcode::REG_SEQUENCE) { + // The source of the INSERT_SUBREG chain is a REG_SEQUENCE. Collect all + // its subregs. 
+ for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); OpIdx += 2) { + Subregs.try_emplace(MI.getOperand(OpIdx + 1).getImm(), + MI.getOperand(OpIdx).getReg()); + } + return; + } + + // Recursively follow INSERT_SUBREG chain assert(MI.getOpcode() == TargetOpcode::INSERT_SUBREG); Subregs.try_emplace(MI.getOperand(3).getImm(), MI.getOperand(2).getReg()); MachineInstr &SrcMI = *MRI.getVRegDef(MI.getOperand(1).getReg()); if (SrcMI.getParent() == MI.getParent() && - SrcMI.getOpcode() == TargetOpcode::INSERT_SUBREG) + (SrcMI.getOpcode() == TargetOpcode::INSERT_SUBREG || + SrcMI.getOpcode() == TargetOpcode::REG_SEQUENCE)) Impl(SrcMI); }; diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/post-select-combine.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/post-select-combine.mir index cc85be01af6d..affe55b58a99 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/post-select-combine.mir +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/post-select-combine.mir @@ -158,3 +158,28 @@ body: | $x4 = COPY %6 PseudoRET implicit $lr, implicit $x4 ... + + +--- +name: combine_256_with_reg_sequence +alignment: 16 +legalized: true +regBankSelected: true +selected: true +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $wl2, $wh2 + ; CHECK-LABEL: name: combine_256_with_reg_sequence + ; CHECK: liveins: $wl2, $wh2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ewl = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec256 = COPY $wh2 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[COPY1]], %subreg.sub_256_hi, [[COPY]], %subreg.sub_256_lo + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE]] + %1:ewl = COPY $wl2 + %2:vec256 = COPY $wh2 + %5:vec512 = REG_SEQUENCE %1, %subreg.sub_256_lo + %6:vec512 = INSERT_SUBREG %5, %2, %subreg.sub_256_hi + PseudoRET implicit $lr, implicit %6 +... 
From 8ba76909564aec0d3ce7159553cb2fbe4bed4399 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Mon, 12 Aug 2024 14:49:19 +0100 Subject: [PATCH 28/31] [AIEX] Combine INSERT_SUBREG chains for more reg classes. Specifically: ACC512, VEC1024, ACC1024 --- llvm/lib/Target/AIE/AIE2RegisterInfo.cpp | 8 +- .../aie2/GlobalISel/post-select-combine.mir | 99 +++++++++++++++++++ 2 files changed, 106 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AIE/AIE2RegisterInfo.cpp b/llvm/lib/Target/AIE/AIE2RegisterInfo.cpp index ce495304e1e6..15d6aa30e8e2 100644 --- a/llvm/lib/Target/AIE/AIE2RegisterInfo.cpp +++ b/llvm/lib/Target/AIE/AIE2RegisterInfo.cpp @@ -445,9 +445,15 @@ SmallSet AIE2RegisterInfo::getCoveringSubRegs(const TargetRegisterClass &RC) const { // TODO: This could be generated from TableGen by looking at MCRegisters. SmallSet Subregs; - if (AIE2::VEC512RegClass.hasSubClassEq(&RC)) { + if (AIE2::VEC512RegClass.hasSubClassEq(&RC) || + AIE2::ACC512RegClass.hasSubClassEq(&RC)) { Subregs.insert(AIE2::sub_256_lo); Subregs.insert(AIE2::sub_256_hi); } + if (AIE2::VEC1024RegClass.hasSubClassEq(&RC) || + AIE2::ACC1024RegClass.hasSubClassEq(&RC)) { + Subregs.insert(AIE2::sub_512_lo); + Subregs.insert(AIE2::sub_512_hi); + } return Subregs; } diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/post-select-combine.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/post-select-combine.mir index affe55b58a99..782541661ea5 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/post-select-combine.mir +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/post-select-combine.mir @@ -183,3 +183,102 @@ body: | %6:vec512 = INSERT_SUBREG %5, %2, %subreg.sub_256_hi PseudoRET implicit $lr, implicit %6 ... 
+ +--- +name: combine_256_acc +alignment: 16 +legalized: true +regBankSelected: true +selected: true +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $bml0, $amll2, $amlh2 + ; CHECK-LABEL: name: combine_256_acc + ; CHECK: liveins: $bml0, $amll2, $amlh2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:acc256 = COPY $amll2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc256 = COPY $amlh2 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc512 = REG_SEQUENCE [[COPY1]], %subreg.sub_256_hi, [[COPY]], %subreg.sub_256_lo + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE]] + %0:acc512 = COPY $bml0 + %1:acc256 = COPY $amll2 + %5:acc512 = INSERT_SUBREG %0, %1, %subreg.sub_256_lo + %2:acc256 = COPY $amlh2 + %6:acc512 = INSERT_SUBREG %5, %2, %subreg.sub_256_hi + PseudoRET implicit $lr, implicit %6 +... + +--- +name: combine_512_vec +alignment: 16 +legalized: true +regBankSelected: true +selected: true +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $y4, $x4, $x5 + ; CHECK-LABEL: name: combine_512_vec + ; CHECK: liveins: $y4, $x4, $x5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:exe = COPY $x4 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec512 = COPY $x5 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec1024 = REG_SEQUENCE [[COPY1]], %subreg.sub_512_hi, [[COPY]], %subreg.sub_512_lo + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE]] + %0:vec1024 = COPY $y4 + %5:exe = COPY $x4 + %6:vec1024 = INSERT_SUBREG %0, %5, %subreg.sub_512_lo + %10:vec512 = COPY $x5 + %11:vec1024 = INSERT_SUBREG %6, %10, %subreg.sub_512_hi + PseudoRET implicit $lr, implicit %11 +... 
+ +--- +name: combine_512_acc +alignment: 16 +legalized: true +regBankSelected: true +selected: true +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $cm4, $bml2, $bmh2 + ; CHECK-LABEL: name: combine_512_acc + ; CHECK: liveins: $cm4, $bml2, $bmh2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ebml = COPY $bml2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc512 = COPY $bmh2 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc1024 = REG_SEQUENCE [[COPY1]], %subreg.sub_512_hi, [[COPY]], %subreg.sub_512_lo + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE]] + %0:acc1024 = COPY $cm4 + %5:ebml = COPY $bml2 + %6:acc1024 = INSERT_SUBREG %0, %5, %subreg.sub_512_lo + %10:acc512 = COPY $bmh2 + %11:acc1024 = INSERT_SUBREG %6, %10, %subreg.sub_512_hi + PseudoRET implicit $lr, implicit %11 +... + +--- +name: combine_512_acc_with_reg_sequence +alignment: 16 +legalized: true +regBankSelected: true +selected: true +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $bml2, $bmh2 + ; CHECK-LABEL: name: combine_512_acc_with_reg_sequence + ; CHECK: liveins: $bml2, $bmh2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ebml = COPY $bml2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:acc512 = COPY $bmh2 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:acc1024 = REG_SEQUENCE [[COPY1]], %subreg.sub_512_hi, [[COPY]], %subreg.sub_512_lo + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[REG_SEQUENCE]] + %5:ebml = COPY $bml2 + %6:acc1024 = REG_SEQUENCE %5, %subreg.sub_512_lo + %10:acc512 = COPY $bmh2 + %11:acc1024 = INSERT_SUBREG %6, %10, %subreg.sub_512_hi + PseudoRET implicit $lr, implicit %11 +... 
From 40624888fc2dabc8bdd5740b5e16611214f0e6da Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Tue, 20 Aug 2024 01:23:36 -0500 Subject: [PATCH 29/31] [WIP] Support windows wheels (#155) --- .github/workflows/amd-aie-distro.yml | 4 +++- .../amd_aie_releases/patches/mscv.patch | 12 ---------- .../amd_aie_releases/scripts/apply_patches.sh | 1 - .github/workflows/amd_aie_releases/setup.py | 24 ++++++++++++------- .../caches/Peano-AIE-runtime-libraries.cmake | 1 + 5 files changed, 19 insertions(+), 23 deletions(-) delete mode 100644 .github/workflows/amd_aie_releases/patches/mscv.patch diff --git a/.github/workflows/amd-aie-distro.yml b/.github/workflows/amd-aie-distro.yml index 9fe8741f69d0..9cd4c1a6d3f5 100644 --- a/.github/workflows/amd-aie-distro.yml +++ b/.github/workflows/amd-aie-distro.yml @@ -154,6 +154,8 @@ jobs: include: - OS: ubuntu-20.04 ARCH: x86_64 + - OS: windows-2019 + ARCH: AMD64 defaults: run: @@ -211,7 +213,7 @@ jobs: LLVM_AIE_PROJECT_COMMIT=${{ needs.settings.outputs.LLVM_AIE_PROJECT_COMMIT }} gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" \ repos/Xilinx/llvm-aie/zipball/$LLVM_AIE_PROJECT_COMMIT > llvm-aie.zip - unzip -q llvm-aie.zip + unzip -qq llvm-aie.zip rm -rf llvm-aie.zip mv Xilinx-llvm-aie-* llvm-aie diff --git a/.github/workflows/amd_aie_releases/patches/mscv.patch b/.github/workflows/amd_aie_releases/patches/mscv.patch deleted file mode 100644 index eaf3b8d9592c..000000000000 --- a/.github/workflows/amd_aie_releases/patches/mscv.patch +++ /dev/null @@ -1,12 +0,0 @@ -diff --git a/cmake/Modules/CMakePolicy.cmake b/cmake/Modules/CMakePolicy.cmake ---- a/cmake/Modules/CMakePolicy.cmake (revision be23604c17dd54318f9431bb107b4122c467ea47) -+++ b/cmake/Modules/CMakePolicy.cmake (date 1714517198548) -@@ -10,3 +10,7 @@ - if(POLICY CMP0116) - cmake_policy(SET CMP0116 OLD) - endif() -+ -+if(POLICY CMP0091) -+ cmake_policy(SET CMP0091 NEW) -+endif() -\ No newline at end of file diff --git 
a/.github/workflows/amd_aie_releases/scripts/apply_patches.sh b/.github/workflows/amd_aie_releases/scripts/apply_patches.sh index 436c4b6e68dd..9985d644c205 100755 --- a/.github/workflows/amd_aie_releases/scripts/apply_patches.sh +++ b/.github/workflows/amd_aie_releases/scripts/apply_patches.sh @@ -3,7 +3,6 @@ set -uxo pipefail # note that space before slash is important PATCHES="\ -mscv \ " if [[ x"${APPLY_PATCHES:-true}" == x"true" ]]; then diff --git a/.github/workflows/amd_aie_releases/setup.py b/.github/workflows/amd_aie_releases/setup.py index f473c070685b..c46cf770dcbc 100644 --- a/.github/workflows/amd_aie_releases/setup.py +++ b/.github/workflows/amd_aie_releases/setup.py @@ -1,11 +1,11 @@ -import shutil -from datetime import datetime import os import platform import re +import shutil import subprocess import sys -from pathlib import Path, PureWindowsPath +from datetime import datetime +from pathlib import Path from pprint import pprint from setuptools import Extension, setup @@ -49,8 +49,6 @@ def build_extension(self, ext: CMakeExtension) -> None: cmake_args = [ f"-B{build_temp}", f"-G {cmake_generator}", - "-DLLVM_BUILD_LLVM_DYLIB=ON", - "-DLLVM_LINK_LLVM_DYLIB=ON", "-DLLVM_BUILD_BENCHMARKS=OFF", "-DLLVM_BUILD_EXAMPLES=OFF", f"-DLLVM_BUILD_TESTS={RUN_TESTS}", @@ -75,10 +73,14 @@ def build_extension(self, ext: CMakeExtension) -> None: f"-DCMAKE_INSTALL_PREFIX={install_dir}", f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}{os.sep}", f"-DPython3_EXECUTABLE={PYTHON_EXECUTABLE}", - f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm + f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm, ] - - if platform.system() == "Windows": + if platform.system() != "Windows": + cmake_args += [ + "-DLLVM_BUILD_LLVM_DYLIB=ON", + "-DLLVM_LINK_LLVM_DYLIB=ON", + ] + else: cmake_args += [ "-DCMAKE_C_COMPILER=cl", "-DCMAKE_CXX_COMPILER=cl", @@ -87,6 +89,7 @@ def build_extension(self, ext: CMakeExtension) -> None: "-DCMAKE_CXX_FLAGS=/MT", 
"-DLLVM_USE_CRT_MINSIZEREL=MT", "-DLLVM_USE_CRT_RELEASE=MT", + "-DCMAKE_POLICY_DEFAULT_CMP0091=NEW", ] if "CMAKE_ARGS" in os.environ: @@ -204,7 +207,10 @@ def build_extension(self, ext: CMakeExtension) -> None: os.getenv("LLVM_AIE_SRC_ROOT", Path.cwd() / "llvm-aie") ).absolute() -cmake_txt = open(LLVM_AIE_SRC_ROOT / "llvm" / "CMakeLists.txt").read() +cmake_version_path = Path("llvm-aie/cmake/Modules/LLVMVersion.cmake") +if not cmake_version_path.exists(): + cmake_version_path = Path("llvm-aie/llvm/CMakeLists.txt") +cmake_txt = open(cmake_version_path).read() llvm_version = [] for v in ["LLVM_VERSION_MAJOR", "LLVM_VERSION_MINOR", "LLVM_VERSION_PATCH"]: vn = re.findall(rf"set\({v} (\d+)\)", cmake_txt) diff --git a/clang/cmake/caches/Peano-AIE-runtime-libraries.cmake b/clang/cmake/caches/Peano-AIE-runtime-libraries.cmake index 47a8adb96151..88b4cd78e67e 100644 --- a/clang/cmake/caches/Peano-AIE-runtime-libraries.cmake +++ b/clang/cmake/caches/Peano-AIE-runtime-libraries.cmake @@ -13,6 +13,7 @@ set(LLVM_ENABLE_RUNTIMES libc CACHE STRING "") set(LLVM_LIBC_FULL_BUILD ON CACHE BOOL "") +set(LLVM_FORCE_BUILD_RUNTIME "libc" CACHE STRING "") set(LLVM_BUILTIN_TARGETS "aie-none-unknown-elf;aie2-none-unknown-elf" CACHE STRING "") set(LLVM_RUNTIME_TARGETS "${LLVM_BUILTIN_TARGETS}" CACHE STRING "") From 5371806892b0eb4320f53187274acbb9d7885ce2 Mon Sep 17 00:00:00 2001 From: Andreu Carminati Date: Tue, 20 Aug 2024 10:24:14 +0100 Subject: [PATCH 30/31] [AIE2] Add HasTiedSubregister to VLD_*D pseudo This avoids some undesired machine-cp changes. 
--- llvm/lib/Target/AIE/AIE2MultiSlotPseudoInstrInfo.td | 4 ++-- llvm/test/CodeGen/AIE/aie2/ra/tie-subregs-pass-vld.mir | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AIE/AIE2MultiSlotPseudoInstrInfo.td b/llvm/lib/Target/AIE/AIE2MultiSlotPseudoInstrInfo.td index 4092e75aac77..46853c49b5cb 100644 --- a/llvm/lib/Target/AIE/AIE2MultiSlotPseudoInstrInfo.td +++ b/llvm/lib/Target/AIE/AIE2MultiSlotPseudoInstrInfo.td @@ -92,13 +92,13 @@ let hasSideEffects = false, mayLoad = true, mayStore = false in { def VLD_2D_pseudo : MultiSlot_Pseudo< (outs mWa:$dst, eP:$ptr_out, eDC:$count_out), (ins eP:$ptr, eD:$mod), "vld.2d_pseudo", "$dst, [$ptr], $mod", - [VLDB_2D, VLDA_2D_dmw_lda_w]>; + [VLDB_2D, VLDA_2D_dmw_lda_w]>, AIE_HasTiedSubregister; } let Itinerary = II_VLDA_3D_W in let Constraints = "$ptr_out = $ptr" in { def VLD_3D_pseudo : MultiSlot_Pseudo< (outs mWa:$dst, eP:$ptr_out, eDC:$count_lo_out, eDC:$count_hi_out), (ins eP:$ptr, eDS:$mod), "vld.3d_pseudo", "$dst, [$ptr], $mod", - [VLDB_3D, VLDA_3D_dmw_lda_w]>; + [VLDB_3D, VLDA_3D_dmw_lda_w]>, AIE_HasTiedSubregister; } } diff --git a/llvm/test/CodeGen/AIE/aie2/ra/tie-subregs-pass-vld.mir b/llvm/test/CodeGen/AIE/aie2/ra/tie-subregs-pass-vld.mir index d318742f8e32..bca2e8a59fc7 100644 --- a/llvm/test/CodeGen/AIE/aie2/ra/tie-subregs-pass-vld.mir +++ b/llvm/test/CodeGen/AIE/aie2/ra/tie-subregs-pass-vld.mir @@ -61,13 +61,14 @@ body: | ; CHECK-NEXT: $x3, %36:ep, [[COPY20]].sub_dim_count:ed = VLDB_2D_UNPACK_D8_D4 [[COPY4]], [[COPY20]], implicit $crunpacksign ; CHECK-NEXT: [[COPY21:%[0-9]+]]:ed = COPY [[REG_SEQUENCE]] ; CHECK-NEXT: $x6, %38:ep, [[COPY21]].sub_dim_count:ed = VLDB_2D_UNPACK_D16_D8 [[COPY4]], [[COPY21]], implicit $crunpacksign + ; CHECK-NEXT: [[COPY22:%[0-9]+]]:ed = COPY [[REG_SEQUENCE]] + ; CHECK-NEXT: $wl0, %40:ep, [[COPY22]].sub_dim_count:ed = VLD_2D_pseudo [[COPY4]], [[COPY22]] %0:em = COPY $r0 %1:edn = COPY $r1 %2:edj = COPY $r2 %3:edc = COPY $r3 %4:ep = COPY $p0 - ; ISel 
code for add_2d_byte %10:ed = REG_SEQUENCE %0, %subreg.sub_mod, %1, %subreg.sub_dim_size, %2, %subreg.sub_dim_stride, %3, %subreg.sub_dim_count $wl0, %20:ep, %21:edc = VLDA_2D_dmw_lda_w %4, %10 $amll0, %30:ep, %31:edc = VLDA_2D_dmw_lda_am %4, %10 @@ -86,6 +87,7 @@ body: | $x1, %160:ep, %161:edc = VLDB_2D_UNPACK_S16_S8 %4, %10 $x3, %170:ep, %171:edc = VLDB_2D_UNPACK_D8_D4 %4, %10, implicit $crunpacksign $x6, %180:ep, %181:edc = VLDB_2D_UNPACK_D16_D8 %4, %10, implicit $crunpacksign + $wl0, %190:ep, %191:edc = VLD_2D_pseudo %4, %10 ... --- @@ -137,6 +139,8 @@ body: | ; CHECK-NEXT: $x3, %47:ep, [[COPY17]].sub_dim_count:eds, [[COPY17]].sub_hi_dim_then_sub_dim_count:eds = VLDB_3D_UNPACK_D8_D4 [[COPY]], [[COPY17]], implicit $crunpacksign ; CHECK-NEXT: [[COPY18:%[0-9]+]]:eds = COPY [[COPY1]] ; CHECK-NEXT: $x6, %50:ep, [[COPY18]].sub_dim_count:eds, [[COPY18]].sub_hi_dim_then_sub_dim_count:eds = VLDB_3D_UNPACK_D16_D8 [[COPY]], [[COPY18]], implicit $crunpacksign + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:eds = COPY [[COPY1]] + ; CHECK-NEXT: $wl0, %53:ep, [[COPY19]].sub_dim_count:eds, [[COPY19]].sub_hi_dim_then_sub_dim_count:eds = VLD_3D_pseudo [[COPY]], [[COPY19]] %0:ep = COPY $p0 %10:eds = COPY $d0_3d $wl0, %20:ep, %21:edc, %22:edc = VLDA_3D_dmw_lda_w %0, %10 @@ -156,4 +160,5 @@ body: | $x1, %160:ep, %161:edc, %162:edc = VLDB_3D_UNPACK_S16_S8 %0, %10 $x3, %170:ep, %171:edc, %172:edc = VLDB_3D_UNPACK_D8_D4 %0, %10, implicit $crunpacksign $x6, %180:ep, %181:edc, %182:edc = VLDB_3D_UNPACK_D16_D8 %0, %10, implicit $crunpacksign + $wl0, %190:ep, %191:edc, %192:edc = VLD_3D_pseudo %0, %10 ... 
From 1dfbea0fcaed57f3b8ecd0fcb62641e7b2b495c9 Mon Sep 17 00:00:00 2001 From: Andreu Carminati Date: Wed, 31 Jul 2024 08:20:58 +0100 Subject: [PATCH 31/31] [AIE] Add missing Copyright info --- llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp | 2 ++ llvm/lib/Target/AIE/AIEInterBlockScheduling.h | 2 ++ llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp | 3 +++ llvm/lib/Target/AIE/AIEMaxLatencyFinder.h | 2 ++ 4 files changed, 9 insertions(+) diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp index d185c47120db..4e54f0abccde 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp @@ -4,6 +4,8 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // +// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// //===----------------------------------------------------------------------===// // Implementations of the classes used to support inter-block scheduling //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h index 8fb452a656b3..ba4526447a86 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h @@ -4,6 +4,8 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // +// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// //===----------------------------------------------------------------------===// // // Class providing services for interblock scheduling. 
diff --git a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp index 433c8a4780fa..970893959fd1 100644 --- a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp +++ b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp @@ -3,6 +3,9 @@ // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// //===----------------------------------------------------------------------===// // // This file implements the interblock latency utilities diff --git a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h index 2b5da5b7177c..4045f91ca538 100644 --- a/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h +++ b/llvm/lib/Target/AIE/AIEMaxLatencyFinder.h @@ -4,6 +4,8 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // +// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// //===----------------------------------------------------------------------===// // // This file declares helpers for inter-block latency computations