diff --git a/llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp b/llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp index b58f1ab163f7..40099b9b7a35 100644 --- a/llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp +++ b/llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp @@ -53,8 +53,10 @@ namespace { const constexpr unsigned NumDelaySlots = 5; } // namespace -unsigned AIEBaseInstrInfo::getNumDelaySlots(const MachineInstr &MI) const { - return MI.hasDelaySlot() ? NumDelaySlots : 0; +unsigned +AIEBaseInstrInfo::getNumDelaySlots(const MachineInstr &MI, + MachineInstr::QueryType Query) const { + return MI.hasDelaySlot(Query) ? NumDelaySlots : 0; } unsigned diff --git a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h index 35d452c951ed..7632602f596f 100644 --- a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h +++ b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h @@ -77,7 +77,10 @@ struct AIEBaseInstrInfo : public TargetInstrInfo { /// Returns the number of delay slots that this instruction requires. /// This might be 0 - virtual unsigned getNumDelaySlots(const MachineInstr &MI) const; + virtual unsigned + getNumDelaySlots(const MachineInstr &MI, + MachineInstr::QueryType Query = + MachineInstr::QueryType::AnyInBundle) const; /// Returns the number of delay slots that should be reserved, i.e. /// not filled in by the scheduler. diff --git a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp index f8768e6575b0..217b5aad44ce 100644 --- a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp +++ b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp @@ -210,7 +210,7 @@ class RegionEndEdges : public ScheduleDAGMutation { } } void apply(ScheduleDAGInstrs *DAG) override { - AIE::MaxLatencyFinder MaxLatency(static_cast(DAG)); + AIE::MaxLatencyFinder MaxLatency(DAG); // Default edges to ExitSU are conservative, and can't be shrunk. 
// We really should know what we're doing here, so just remove and @@ -224,7 +224,12 @@ class RegionEndEdges : public ScheduleDAGMutation { SDep ExitDep(&SU, SDep::Artificial); - unsigned DelaySlots = TII->getNumDelaySlots(MI); + // By using IgnoreBundle, we can safely apply this mutation to already + // bundled instructions without causing misclassification of instructions + // that are bundled with control flow ones. Otherwise, the assertion + // below can be triggered for correct cases. + unsigned DelaySlots = + TII->getNumDelaySlots(MI, MachineInstr::QueryType::IgnoreBundle); unsigned EdgeLatency = !DelaySlots && UserSetLatencyMargin ? UserLatencyMargin : MaxLatency(MI); @@ -355,12 +360,15 @@ class PropagateIncomingLatencies : public ScheduleDAGMutation { class MemoryEdges : public ScheduleDAGMutation { void apply(ScheduleDAGInstrs *DAG) override { const auto *TII = static_cast(DAG->TII); - + // Query individual instruction behavior. This is because we might create + // dependencies with already-scheduled blocks where Bundles have been + // created. + const auto QueryType = MachineInstr::QueryType::IgnoreBundle; // Run over all instructions that may load or store, and correct the // latencies for all their memory dependencies. for (SUnit &SU : DAG->SUnits) { MachineInstr &MI = *SU.getInstr(); - if (!MI.mayLoadOrStore()) { + if (!MI.mayLoadOrStore(QueryType)) { continue; } @@ -369,13 +377,14 @@ class MemoryEdges : public ScheduleDAGMutation { // Ignore non-memory dependencies. Locks or other instructions with side // effects aren't handled with MemInstrItinData itineraries. - if (!PredEdge.isNormalMemoryOrBarrier() || !SrcMI.mayLoadOrStore()) { + if (!PredEdge.isNormalMemoryOrBarrier() || + !SrcMI.mayLoadOrStore(QueryType)) { continue; } // Ignore Load-Load (RAR) dependencies. // TODO: Those should probably be removed altogether. 
- if (!SrcMI.mayStore() && !MI.mayStore()) { + if (!SrcMI.mayStore(QueryType) && !MI.mayStore(QueryType)) { continue; } diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp index 3baabf0174c6..6608dfd12197 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp @@ -32,8 +32,58 @@ static cl::opt LoopAware("aie-loop-aware", cl::init(true), cl::desc("[AIE] Schedule single block loops iteratively")); +static cl::opt LoopEpilogueAnalysis( + "aie-loop-epilogue-analysis", cl::init(true), + cl::desc("[AIE] Perform Loop/Epilogue analysis with loop scheduling")); + namespace llvm::AIE { +void dumpInterBlock(const InterBlockEdges &Edges) { + for (const SUnit &SU : Edges) { + dbgs() << "SU" << SU.NodeNum << ": " << *SU.getInstr(); + } +} + +void emitBundlesInScoreboard(const std::vector &Bundles, + ResourceScoreboard &Scoreboard, + AIEHazardRecognizer *HR) { + + const int TotalBundles = Bundles.size(); + const int AmountToEmit = std::min(TotalBundles, HR->getConflictHorizon()); + // Do not emit more than the specified by the conflict horizon. More + // then this will not cause conflicts. + for (int i = TotalBundles - AmountToEmit; i < TotalBundles; i++) { + for (MachineInstr *MI : Bundles[i].getInstrs()) + HR->emitInScoreboard(Scoreboard, MI->getDesc(), 0); + + Scoreboard.advance(); + } +} + +void emitBundlesInScoreboardDelta( + const std::vector &Bundles, + ResourceScoreboard &Scoreboard, int &Delta, + AIEHazardRecognizer *HR) { + + for (auto &Bundle : Bundles) { + // We don't need to replay more instructions, because we exhausted the + // scoreboard. 
+ if (Delta >= 0) + break; + + for (MachineInstr *MI : Bundle.getInstrs()) + HR->emitInScoreboard(Scoreboard, MI->getDesc(), Delta); + + Delta++; + } +} + +MachineBasicBlock *getSinglePredecessor(const MachineBasicBlock &MBB) { + assert(MBB.pred_size() == 1 && "MBB contains more than 1 predecessor"); + MachineBasicBlock *SinglePredMBB = *MBB.predecessors().begin(); + return SinglePredMBB; +} + InterBlockScheduling::InterBlockScheduling(const MachineSchedContext *C, bool InterBlock) : Context(C), InterBlockScoreboard(InterBlock) {} @@ -118,12 +168,8 @@ bool InterBlockScheduling::resourcesConverged(BlockState &BS) const { ResourceScoreboard Bottom; Bottom.reset(Depth); - for (auto &Bundle : BS.getBottom().Bundles) { - for (MachineInstr *MI : Bundle.getInstrs()) { - HR->emitInScoreboard(Bottom, MI->getDesc(), 0); - } - Bottom.advance(); - } + emitBundlesInScoreboard(BS.getBottom().Bundles, Bottom, HR.get()); + DEBUG_LOOPAWARE(dbgs() << "Bottom scoreboard\n"; Bottom.dump()); // We have two successors, the loop itself and the epilogue assert(BS.TheBlock->succ_size() == 2); @@ -138,15 +184,9 @@ bool InterBlockScheduling::resourcesConverged(BlockState &BS) const { ResourceScoreboard Top; Top.reset(Depth); int Cycle = -Depth; - for (auto &Bundle : BS.getBottom().Bundles) { - if (Cycle >= 0) { - break; - } - for (MachineInstr *MI : Bundle.getInstrs()) { - HR->emitInScoreboard(Top, MI->getDesc(), Cycle); - } - Cycle++; - } + + emitBundlesInScoreboardDelta(BS.getBottom().Bundles, Top, Cycle, HR.get()); + DEBUG_LOOPAWARE(dbgs() << "Top scoreboard\n"; Top.dump()); if (Bottom.conflict(Top, Depth)) { return false; @@ -369,14 +409,152 @@ int InterBlockScheduling::getNumEntryNops(const BlockState &BS) const { } const MachineBasicBlock &BB = *BS.TheBlock; assert(BB.pred_size() == 1); - MachineBasicBlock *Loop = *BB.predecessors().begin(); + MachineBasicBlock *Loop = getSinglePredecessor(BB); auto &LBS = getBlockState(Loop); - // TODO: we can do better by doing full interblock 
analysis - // between BS and LBS + // We can only analyze non-empty epilogue blocks because we need + // to build a DDG, which is not possible. + // For empty ones, we need to be conservative because we are not aware of + // content of epilogues' successor. + if (LoopEpilogueAnalysis && BB.size() > 0) { + int ExistingLatency = getCyclesToRespectTiming(BS, LBS); + // Start the next step only after clearing latencies. + return getCyclesToAvoidResourceConflicts(ExistingLatency, BS, LBS); + } + return LBS.getSafetyMargin(); } +int InterBlockScheduling::getCyclesToRespectTiming( + const BlockState &EpilogueBS, const BlockState &LoopBS) const { + + const MachineBasicBlock &EpilogueMBB = *EpilogueBS.TheBlock; + const MachineBasicBlock *LoopMBB = getSinglePredecessor(EpilogueMBB); + + DEBUG_LOOPAWARE(dbgs() << "** Loop/Epilogue-carried latency dependencies:" + << " Original Loop " << *LoopMBB + << " Original Epilogue " << EpilogueMBB << "\n"); + + InterBlockEdges Edges(*Context); + std::map DistancesFromLoopEntry; + int DistFromLoopEntry = 0; + int EntryNops = 0; + + auto AddRegionToEdges = [&](const Region &R) { + for (auto &Bundle : R.Bundles) { + for (MachineInstr *MI : Bundle.getInstrs()) { + DistancesFromLoopEntry[MI] = DistFromLoopEntry; + Edges.addNode(MI); + } + ++DistFromLoopEntry; + } + }; + + // Construction of the superblock containing Loop+Epilogue + // First part is the loop + AddRegionToEdges(LoopBS.getBottom()); + Edges.markBoundary(); + // Second part is the epilogue itself + AddRegionToEdges(EpilogueBS.getTop()); + Edges.buildEdges(); + + DEBUG_LOOPAWARE(dumpInterBlock(Edges)); + // Check cross-boundary latencies. 
+ int Height = 1; + for (auto &Bundle : reverse(LoopBS.getBottom().Bundles)) { + for (auto *PreBoundaryMI : Bundle.getInstrs()) { + const SUnit *Pred = Edges.getPreBoundaryNode(PreBoundaryMI); + + for (auto &SDep : Pred->Succs) { + auto *Succ = SDep.getSUnit(); + + if (!Edges.isPostBoundaryNode(Succ)) + continue; + + const MachineInstr *PostBoundaryMI = Succ->getInstr(); + + const int PostBoundOrExitDist = + (PostBoundaryMI != nullptr) + ? DistancesFromLoopEntry[PostBoundaryMI] + // When getInstr returns nullptr, we reached + // ExitSU. We can consider the DistFromLoopEntry as + // depth of the ExitSU. + : DistFromLoopEntry; + + const int Latency = SDep.getSignedLatency(); + const int Distance = + PostBoundOrExitDist - DistancesFromLoopEntry[PreBoundaryMI]; + + DEBUG_LOOPAWARE(dbgs() << "Data dependency found:\n" + << " Loop instruction SU: " << *PreBoundaryMI); + DEBUG_LOOPAWARE(dbgs() << " Epilogue instruction: "; + if (PostBoundaryMI) PostBoundaryMI->dump(); + else dbgs() << "nullptr (ExitSU)";); + DEBUG_LOOPAWARE(dbgs() << "\n Latency: " << Latency + << "\n Distance: " << Distance << "\n"); + + EntryNops = std::max(EntryNops, Latency - Distance); + } + } + if (++Height > HR->getConflictHorizon()) { + break; + } + } + + return EntryNops; +} + +int InterBlockScheduling::getCyclesToAvoidResourceConflicts( + int ExistingLatency, const BlockState &EpilogueBS, + const BlockState &LoopBS) const { + + const MachineBasicBlock &EpilogueMBB = *EpilogueBS.TheBlock; + MachineBasicBlock *LoopMBB = LoopBS.TheBlock; + int Depth = HR->getMaxLookAhead(); + ResourceScoreboard Bottom; + Bottom.reset(Depth); + + DEBUG_LOOPAWARE(dbgs() << "* Loop/Epilogue-carried resource conflicts:" + << " Original Loop " << *LoopMBB << " Original Epilog " + << EpilogueMBB << "\n"); + + emitBundlesInScoreboard(LoopBS.getBottom().Bundles, Bottom, HR.get()); + + // We know how many latency cycles we need to respect, and we can advance + // the scoreboard to the first possible cycle that can 
accommodate another + // instruction and start the resource verification from this point, tracking + // the number of NOPS. + int NopCounter = 0; + for (NopCounter = 0; NopCounter < ExistingLatency; ++NopCounter) + Bottom.advance(); + + DEBUG_LOOPAWARE(dbgs() << "Loop scoreboard\n"; Bottom.dump()); + + ResourceScoreboard Top; + Top.reset(Depth); + int Cycle = -Depth; + + auto Bundles = EpilogueBS.getBottom().Bundles; + + emitBundlesInScoreboardDelta(EpilogueBS.getBottom().Bundles, Top, Cycle, + HR.get()); + + DEBUG_LOOPAWARE(dbgs() << "Epilogue scoreboard\n"; Top.dump()); + + // Use scoreboard comparison to calculate the number of nops + while (Bottom.conflict(Top, Depth)) { + Bottom.advance(); + NopCounter++; + } + + DEBUG_LOOPAWARE(dbgs() << "Resource conflict avoidance between" + << " loop: " << *LoopMBB + << " And epilogue: " << EpilogueMBB << " Requires " + << NopCounter << " Nops\n"); + + return NopCounter; +} + void InterBlockEdges::addNode(MachineInstr *MI) { if (auto Index = DDG.initSUnit(*MI)) { IndexMap &TheMap = Boundary ? SuccMap : PredMap; @@ -461,12 +639,6 @@ void BlockState::classify() { // construction. 
} -void dumpInterBlock(const InterBlockEdges &Edges) { - for (const SUnit &SU : Edges) { - dbgs() << "SU" << SU.NodeNum << ": " << *SU.getInstr(); - } -} - void BlockState::initInterBlock(const MachineSchedContext &Context) { BoundaryEdges = std::make_unique(Context); diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h index a0939667373a..996f0a5d0dcf 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h @@ -257,6 +257,17 @@ class InterBlockScheduling { /// returns true if converged bool updateFixPoint(BlockState &BS); + /// Calculate the number of cycles that are needed to respect + /// latencies related to the loop whose the epilogue is associated + int getCyclesToRespectTiming(const BlockState &EpilogueBS, + const BlockState &LoopBS) const; + + /// Calculate the number of cycles that are needed to avoid resource + /// conflicts between loop and epilogue + int getCyclesToAvoidResourceConflicts(int ExistingLatency, + const BlockState &EpilogueBS, + const BlockState &LoopBS) const; + BlockState *CurrentBlock = nullptr; public: diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll index 97333b72ce98..6ebf79d187f5 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll @@ -153,10 +153,6 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ASM-NEXT: add r1, r1, #33; vmac cm7, cm7, x5, x6, r4 // Delay Slot 1 ; ASM-NEXT: // %bb.3: // %outer.loop.latch ; ASM-NEXT: // in Loop: Header=BB0_1 Depth=1 -; ASM-NEXT: nopa ; nopb ; nopx -; ASM-NEXT: nop -; ASM-NEXT: nop -; ASM-NEXT: nop ; ASM-NEXT: vst.srs.s16.s32 bmh0, s2, [p3, #32] ; ASM-NEXT: vst.srs.s16.s32 bml0, s2, [p3], #64 ; ASM-NEXT: vst.srs.s16.s32 bmh1, s2, [p3, #32]; mov m2, r31 diff --git 
a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/fixpoint.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/fixpoint.mir index 3e63fa109faf..a423c2c9fcc8 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/fixpoint.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/fixpoint.mir @@ -7,7 +7,9 @@ # (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates # RUN: llc --mtriple=aie2 --run-pass=postmisched \ -# RUN: %s -o - | FileCheck %s +# RUN: -aie-loop-epilogue-analysis=false %s -o - | FileCheck %s +# RUN: llc --mtriple=aie2 --run-pass=postmisched \ +# RUN: %s -o - | FileCheck %s --check-prefix=CHECK-EP-ANALYSIS # Check that loop-aware scheduling converges properly # Note: Comments sometimes refer to cycles. The first cycle is cycle 0 @@ -47,6 +49,33 @@ body: | ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP ; CHECK-NEXT: DelayedSchedBarrier implicit killed $r0 + ; + ; CHECK-EP-ANALYSIS-LABEL: name: singleSuccLoopPreheader + ; CHECK-EP-ANALYSIS: bb.0: + ; CHECK-EP-ANALYSIS-NEXT: successors: %bb.1(0x80000000) + ; CHECK-EP-ANALYSIS-NEXT: {{ $}} + ; CHECK-EP-ANALYSIS-NEXT: {{ $}} + ; CHECK-EP-ANALYSIS-NEXT: bb.1: + ; CHECK-EP-ANALYSIS-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-EP-ANALYSIS-NEXT: liveins: $r0, $r1, $r2, $cm0, $cm1, $cm2, $r0 + ; CHECK-EP-ANALYSIS-NEXT: {{ $}} + ; CHECK-EP-ANALYSIS-NEXT: $r10 = OR $r0, $r1 + ; CHECK-EP-ANALYSIS-NEXT: $r3 = OR killed $r10, $r2 + ; CHECK-EP-ANALYSIS-NEXT: $cm3 = VMOV_mv_cm $cm1 { + ; CHECK-EP-ANALYSIS-NEXT: $cm2 = VADD $cm0, $cm1, killed $r3 + ; CHECK-EP-ANALYSIS-NEXT: } + ; CHECK-EP-ANALYSIS-NEXT: PseudoLoopEnd %bb.1, %bb.2 + ; CHECK-EP-ANALYSIS-NEXT: {{ $}} + ; CHECK-EP-ANALYSIS-NEXT: bb.2: + ; CHECK-EP-ANALYSIS-NEXT: liveins: $r0 + ; CHECK-EP-ANALYSIS-NEXT: {{ $}} + ; CHECK-EP-ANALYSIS-NEXT: RET implicit $lr + ; CHECK-EP-ANALYSIS-NEXT: NOP + ; CHECK-EP-ANALYSIS-NEXT: NOP + ; CHECK-EP-ANALYSIS-NEXT: NOP + ; CHECK-EP-ANALYSIS-NEXT: NOP + ; CHECK-EP-ANALYSIS-NEXT: NOP + ; 
CHECK-EP-ANALYSIS-NEXT: DelayedSchedBarrier implicit killed $r0 bb.0: successors: %bb.1 bb.1: diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/latency.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/latency.mir index 87117db25926..b5919e0a5ccb 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/latency.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/latency.mir @@ -9,8 +9,7 @@ # RUN: llc --mtriple=aie2 --issue-limit=6 --run-pass=postmisched \ # RUN: --aie-interblock-scoreboard=false %s -o - | FileCheck %s -# Check that inter-block latencies are respected. We disable the interblock -# scoreboard from the commandline to make the test more sensitive +# Check that inter-block latencies are respected. # Note: Comments sometimes refer to cycles. The first cycle is cycle 0 @@ -100,7 +99,6 @@ body: | ; CHECK-NEXT: bb.2: ; CHECK-NEXT: liveins: $r1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: NOP ; CHECK-NEXT: RET implicit $lr ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP @@ -159,12 +157,6 @@ body: | ; CHECK-NEXT: bb.2: ; CHECK-NEXT: liveins: $r3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: NOP - ; CHECK-NEXT: NOP - ; CHECK-NEXT: NOP - ; CHECK-NEXT: NOP - ; CHECK-NEXT: NOP - ; CHECK-NEXT: NOP ; CHECK-NEXT: RET implicit $lr ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-epilogue-shift.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-epilogue-shift.mir new file mode 100644 index 000000000000..c555e4403678 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-epilogue-shift.mir @@ -0,0 +1,293 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py + +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2 --issue-limit=6 --run-pass=postmisched \ +# RUN: --aie-interblock-scoreboard=true %s -o - \ +# RUN: | FileCheck %s + +# Check that inter-block latencies are respected by the loop-aware in +# loop/epilogue analysis. + +# WARNING: this set of tests relies on not breaking WAW dependencies. If this +# set fails after some WAW-dependency-breaking mutation is added, please consider +# the inclusion of an option to disable such mutation to keep the correct +# semantics of the tests. + + +# This set of tests has as its goal testing the interaction between latency +# and resource conflict analysis. In this way, an operand latency induced by +# the loop on the epilogue can cause a shift on the epilogue's scoreboard +# with a side effect of causing resource conflicts that would not be present +# without the operand latency. + +# Test 1: the goal here is a baseline test, where we do not need any +# additional nops in the epilogue, because all latencies are covered +# and we do not have resource conflicts. Note that MOVA_lda_cg and +# LDA_dms_lda_idx share the same write port WX, but at a safe distance. 
+ +--- +name: noNopsBaseline +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: noNopsBaseline + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $p0, $r0, $r1, $dj0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: JNZ $r0, %bb.1 + ; CHECK-NEXT: $p0 = PADDA_lda_ptr_inc_idx_imm killed $p0, 4 + ; CHECK-NEXT: $p0 = PADDA_lda_ptr_inc_idx_imm killed $p0, 4 + ; CHECK-NEXT: $p0 = PADDA_lda_ptr_inc_idx_imm killed $p0, 4 + ; CHECK-NEXT: $p0 = PADDA_lda_ptr_inc_idx_imm killed $p0, 4 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx $p0, $dj0 + ; CHECK-NEXT: DelayedSchedBarrier + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: liveins: $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + ; CHECK-NEXT: $r1 = MOVA_lda_cg 1023 + ; CHECK-NEXT: $r1 = MOVA_lda_cg 1023 + ; CHECK-NEXT: $r1 = MOVA_lda_cg 1023 + ; CHECK-NEXT: $r1 = MOVA_lda_cg 1023 + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: liveins: $r3, $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier implicit killed $r3 + bb.0: + successors: %bb.1 + liveins: $p0 + bb.1: + liveins: $p0, $r0, $r1, $dj0 + successors: %bb.1, %bb.2 + $p0 = PADDA_lda_ptr_inc_idx_imm $p0, 4 + $p0 = PADDA_lda_ptr_inc_idx_imm $p0, 4 + $p0 = PADDA_lda_ptr_inc_idx_imm $p0, 4 + $p0 = PADDA_lda_ptr_inc_idx_imm $p0, 4 
+ $r2 = LDA_dms_lda_idx $p0, killed $dj0 + JNZ $r0, %bb.1 + DelayedSchedBarrier + bb.2: + successors: %bb.3 + liveins: $r1 + $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + $r1 = MOVA_lda_cg 1023 + $r1 = MOVA_lda_cg 1023 + $r1 = MOVA_lda_cg 1023 + $r1 = MOVA_lda_cg 1023 + $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + bb.3: + successors: + liveins: $r3, $p0 + RET implicit $lr + DelayedSchedBarrier implicit $r3 +... + +# Test 2: now we have sufficient MOVA_lda_cg to cause resource +# conflicts to LDA_dms_lda_idx, so we need to shift sufficiently +# the scoreboard to a safe point (NOPS) + +--- +name: nopsInducedByConflicts +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: nopsInducedByConflicts + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $p0, $r0, $r1, $dj0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: JNZ $r0, %bb.1 + ; CHECK-NEXT: $p0 = PADDA_lda_ptr_inc_idx_imm killed $p0, 4 + ; CHECK-NEXT: $p0 = PADDA_lda_ptr_inc_idx_imm killed $p0, 4 + ; CHECK-NEXT: $p0 = PADDA_lda_ptr_inc_idx_imm killed $p0, 4 + ; CHECK-NEXT: $p0 = PADDA_lda_ptr_inc_idx_imm killed $p0, 4 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx $p0, $dj0 + ; CHECK-NEXT: DelayedSchedBarrier + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: liveins: $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + ; CHECK-NEXT: $r1 = MOVA_lda_cg 1023 + ; CHECK-NEXT: $r1 = MOVA_lda_cg 1023 + ; CHECK-NEXT: $r1 = 
MOVA_lda_cg 1023 + ; CHECK-NEXT: $r1 = MOVA_lda_cg 1023 + ; CHECK-NEXT: $r1 = MOVA_lda_cg 1023 + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: liveins: $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier implicit killed $r3 + bb.0: + successors: %bb.1 + liveins: $p0 + bb.1: + liveins: $p0, $r0, $r1, $dj0 + successors: %bb.1, %bb.2 + $p0 = PADDA_lda_ptr_inc_idx_imm $p0, 4 + $p0 = PADDA_lda_ptr_inc_idx_imm $p0, 4 + $p0 = PADDA_lda_ptr_inc_idx_imm $p0, 4 + $p0 = PADDA_lda_ptr_inc_idx_imm $p0, 4 + $r2 = LDA_dms_lda_idx $p0, killed $dj0 + JNZ $r0, %bb.1 + DelayedSchedBarrier + bb.2: + successors: %bb.3 + liveins: $r1 + $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + $r1 = MOVA_lda_cg 1023 + $r1 = MOVA_lda_cg 1023 + $r1 = MOVA_lda_cg 1023 + $r1 = MOVA_lda_cg 1023 + $r1 = MOVA_lda_cg 1023 + $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + bb.3: + successors: + liveins: $r3 + RET implicit $lr + DelayedSchedBarrier implicit $r3 +... + +# Test 3: the same scenario as test 1, but we will cause a latency +# by introducing a data dependency (MUL_mul_r_rr and ADD_add_r_ri). +# This latency moves the last MOVA_lda_cg to a conflicting point +# with LDA_dms_lda_idx. As an effect, we need to shift all MOVA_lda_cg +# to a cycle without conflicts with LDA_dms_lda_idx (NOPS). 
+ +--- +name: nopConflictsInducedByLatency +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: nopConflictsInducedByLatency + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $p0, $r0, $r1, $dj0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: JNZ $r0, %bb.1 + ; CHECK-NEXT: $p0 = PADDA_lda_ptr_inc_idx_imm killed $p0, 4 + ; CHECK-NEXT: $p0 = PADDA_lda_ptr_inc_idx_imm killed $p0, 4 + ; CHECK-NEXT: $p0 = PADDA_lda_ptr_inc_idx_imm killed $p0, 4 + ; CHECK-NEXT: $p0 = PADDA_lda_ptr_inc_idx_imm killed $p0, 4 + ; CHECK-NEXT: $r2 = LDA_dms_lda_idx $p0, $dj0 { + ; CHECK-NEXT: $r1 = MUL_mul_r_rr killed $r1, $r1 + ; CHECK-NEXT: } + ; CHECK-NEXT: DelayedSchedBarrier + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: liveins: $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + ; CHECK-NEXT: $r1 = MOVA_lda_cg 1023 + ; CHECK-NEXT: $r1 = MOVA_lda_cg 1023 + ; CHECK-NEXT: $r1 = MOVA_lda_cg 1023 + ; CHECK-NEXT: $r1 = MOVA_lda_cg 1023 + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: liveins: $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier implicit killed $r3 + bb.0: + successors: %bb.1 + liveins: $p0 + bb.1: + liveins: $p0, $r0, $r1, 
$dj0 + successors: %bb.1, %bb.2 + $p0 = PADDA_lda_ptr_inc_idx_imm $p0, 4 + $p0 = PADDA_lda_ptr_inc_idx_imm $p0, 4 + $p0 = PADDA_lda_ptr_inc_idx_imm $p0, 4 + $p0 = PADDA_lda_ptr_inc_idx_imm $p0, 4 + $r2 = LDA_dms_lda_idx $p0, killed $dj0 + $r1 = MUL_mul_r_rr killed $r1, $r1 + JNZ $r0, %bb.1 + DelayedSchedBarrier + bb.2: + successors: %bb.3 + liveins: $r1 + $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + $r1 = MOVA_lda_cg 1023 + $r1 = MOVA_lda_cg 1023 + $r1 = MOVA_lda_cg 1023 + $r1 = MOVA_lda_cg 1023 + $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + $r1 = ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + bb.3: + successors: + liveins: $r3 + RET implicit $lr + DelayedSchedBarrier implicit $r3 +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-epilogue.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-epilogue.mir new file mode 100644 index 000000000000..e0c9aa687fa1 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-epilogue.mir @@ -0,0 +1,624 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py + +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2 --issue-limit=6 --run-pass=postmisched \ +# RUN: --aie-interblock-scoreboard=false %s -o - \ +# RUN: | FileCheck %s + +# Check that inter-block latencies are respected by the loop-aware in +# loop/epilogue analysis. + +# Test 1: operand latency between MUL (loop) and ADD (epilogue). 
+ +--- +name: safetyOneNopLatency +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: safetyOneNopLatency + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $r0, $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: JNZ $r0, %bb.1 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 1, implicit-def $srcarry + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 2, implicit-def $srcarry + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 3, implicit-def $srcarry + ; CHECK-NEXT: $r1 = MUL_mul_r_rr killed $r1, $r2 + ; CHECK-NEXT: DelayedSchedBarrier + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $r1 = ADD_add_r_ri $r2, 5, implicit-def $srcarry + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r2, 5, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, 7, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, 8, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, 9, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, 10, implicit-def $srcarry + ; CHECK-NEXT: DelayedSchedBarrier implicit killed $r1 + + + bb.0: + successors: %bb.1 + bb.1: + successors: %bb.2, %bb.1 + liveins: $r0, $r1, $r2 + $r2 = ADD_add_r_ri $r2, 1, implicit-def $srcarry + $r2 = ADD_add_r_ri $r2, 2, implicit-def $srcarry + $r2 = ADD_add_r_ri $r2, 3, implicit-def $srcarry + $r1 = MUL_mul_r_rr $r1, $r2 + JNZ $r0, %bb.1 + DelayedSchedBarrier + bb.2: + successors: + liveins: $r1, $r2 + $r1 = ADD_add_r_ri $r2, 5, implicit-def $srcarry + $r1 = ADD_add_r_ri $r2, 5, implicit-def $srcarry + $r1 = ADD_add_r_ri $r1, 7, implicit-def $srcarry + $r1 = ADD_add_r_ri $r1, 8, implicit-def $srcarry + $r1 = ADD_add_r_ri $r1, 9, implicit-def $srcarry + $r1 = 
ADD_add_r_ri $r1, 10, implicit-def $srcarry + RET implicit $lr + DelayedSchedBarrier implicit $r1 +... + +# Test 2: resource conflict between MUL (loop) and ADD (epilogue). + +--- +name: safetyOneNopResConflict +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: safetyOneNopResConflict + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $r0, $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: JNZ $r0, %bb.1 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 1, implicit-def $srcarry + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 2, implicit-def $srcarry + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 3, implicit-def $srcarry + ; CHECK-NEXT: $r1 = MUL_mul_r_rr killed $r1, $r2 + ; CHECK-NEXT: DelayedSchedBarrier + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 5, implicit-def $srcarry + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r2, 5, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, 7, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, 8, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, 9, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, 10, implicit-def $srcarry + ; CHECK-NEXT: DelayedSchedBarrier implicit killed $r1 + + + bb.0: + successors: %bb.1 + bb.1: + successors: %bb.2, %bb.1 + liveins: $r0, $r1, $r2 + $r2 = ADD_add_r_ri $r2, 1, implicit-def $srcarry + $r2 = ADD_add_r_ri $r2, 2, implicit-def $srcarry + $r2 = ADD_add_r_ri $r2, 3, implicit-def $srcarry + $r1 = MUL_mul_r_rr $r1, $r2 + JNZ $r0, %bb.1 + DelayedSchedBarrier + bb.2: + successors: + liveins: $r1, $r2 + $r2 = ADD_add_r_ri $r2, 5, implicit-def $srcarry + $r1 = ADD_add_r_ri $r2, 5, 
implicit-def $srcarry + $r1 = ADD_add_r_ri $r1, 7, implicit-def $srcarry + $r1 = ADD_add_r_ri $r1, 8, implicit-def $srcarry + $r1 = ADD_add_r_ri $r1, 9, implicit-def $srcarry + $r1 = ADD_add_r_ri $r1, 10, implicit-def $srcarry + RET implicit $lr + DelayedSchedBarrier implicit $r1 +... + +# Test 3: no resource conflict and no latency. + +--- +name: noResConflictNoLatency +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: noResConflictNoLatency + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $r0, $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: JNZ $r0, %bb.1 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 1, implicit-def $srcarry + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 2, implicit-def $srcarry + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 3, implicit-def $srcarry + ; CHECK-NEXT: $r1 = MUL_mul_r_rr killed $r1, $r2 + ; CHECK-NEXT: DelayedSchedBarrier + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r2, 5, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, 7, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, 8, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, 9, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, 10, implicit-def $srcarry + ; CHECK-NEXT: DelayedSchedBarrier implicit killed $r1 + + + bb.0: + successors: %bb.1 + bb.1: + successors: %bb.2, %bb.1 + liveins: $r0, $r1, $r2 + $r2 = ADD_add_r_ri $r2, 1, implicit-def $srcarry + $r2 = ADD_add_r_ri $r2, 2, implicit-def $srcarry + $r2 = ADD_add_r_ri $r2, 3, implicit-def $srcarry + $r1 = MUL_mul_r_rr $r1, $r2 + JNZ $r0, %bb.1 + DelayedSchedBarrier + bb.2: + successors: + liveins: $r1, $r2 + $r1 = 
ADD_add_r_ri $r2, 5, implicit-def $srcarry + $r1 = ADD_add_r_ri $r1, 7, implicit-def $srcarry + $r1 = ADD_add_r_ri $r1, 8, implicit-def $srcarry + $r1 = ADD_add_r_ri $r1, 9, implicit-def $srcarry + $r1 = ADD_add_r_ri $r1, 10, implicit-def $srcarry + RET implicit $lr + DelayedSchedBarrier implicit $r1 +... + + +# Test 4: test effects of long latency. + +--- +name: safetyNopsLongLatency +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: safetyNopsLongLatency + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $p0, $r0, $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: JNZ $r0, %bb.1 + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 1, implicit-def $srcarry + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 2, implicit-def $srcarry + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 3, implicit-def $srcarry + ; CHECK-NEXT: $dj0 = MOV_mv_scl $r2 + ; CHECK-NEXT: $r1 = LDA_dms_lda_idx $p0, killed $dj0 + ; CHECK-NEXT: DelayedSchedBarrier + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r1, 5, implicit-def $srcarry + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 7, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r2, 8, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, 9, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, 10, implicit-def $srcarry + ; CHECK-NEXT: DelayedSchedBarrier implicit killed $r1 + + + bb.0: + successors: %bb.1 + bb.1: + successors: %bb.2, %bb.1 + liveins: $p0, $r0, $r1, $r2 + $r2 = ADD_add_r_ri $r2, 1, implicit-def $srcarry + $r2 = ADD_add_r_ri $r2, 2, implicit-def $srcarry + $r2 = ADD_add_r_ri $r2, 
3, implicit-def $srcarry + $dj0 = MOV_mv_scl $r2 + $r1 = LDA_dms_lda_idx $p0, $dj0 + JNZ $r0, %bb.1 + DelayedSchedBarrier + bb.2: + successors: + liveins: $r1, $r2 + $r2 = ADD_add_r_ri $r1, 5, implicit-def $srcarry + $r2 = ADD_add_r_ri $r2, 7, implicit-def $srcarry + $r1 = ADD_add_r_ri $r2, 8, implicit-def $srcarry + $r1 = ADD_add_r_ri $r1, 9, implicit-def $srcarry + $r1 = ADD_add_r_ri $r1, 10, implicit-def $srcarry + RET implicit $lr + DelayedSchedBarrier implicit $r1 +... + +# Test 5: test effects of long latency and data dependency +# with an instruction inside of the conflict horizon. + +--- +name: safetyNopsLongLatencyReduced +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: safetyNopsLongLatencyReduced + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $p0, $r0, $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: JNZ $r0, %bb.1 + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 1, implicit-def $srcarry + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 2, implicit-def $srcarry + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 3, implicit-def $srcarry + ; CHECK-NEXT: $dj0 = MOV_mv_scl $r2 + ; CHECK-NEXT: $r1 = LDA_dms_lda_idx $p0, killed $dj0 + ; CHECK-NEXT: DelayedSchedBarrier + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 5, implicit-def $srcarry + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 7, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r2, 8, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, 9, implicit-def $srcarry + ; CHECK-NEXT: $r1 = ADD_add_r_ri killed $r1, 10, implicit-def $srcarry + ; CHECK-NEXT: DelayedSchedBarrier implicit killed $r1 + + + 
bb.0: + successors: %bb.1 + bb.1: + successors: %bb.2, %bb.1 + liveins: $p0, $r0, $r1, $r2 + $r2 = ADD_add_r_ri $r2, 1, implicit-def $srcarry + $r2 = ADD_add_r_ri $r2, 2, implicit-def $srcarry + $r2 = ADD_add_r_ri $r2, 3, implicit-def $srcarry + $dj0 = MOV_mv_scl $r2 + $r1 = LDA_dms_lda_idx $p0, $dj0 + JNZ $r0, %bb.1 + DelayedSchedBarrier + bb.2: + successors: + liveins: $r1, $r2 + $r2 = ADD_add_r_ri $r2, 5, implicit-def $srcarry + $r2 = ADD_add_r_ri $r2, 7, implicit-def $srcarry + $r1 = ADD_add_r_ri $r2, 8, implicit-def $srcarry + $r1 = ADD_add_r_ri $r1, 9, implicit-def $srcarry + $r1 = ADD_add_r_ri $r1, 10, implicit-def $srcarry + RET implicit $lr + DelayedSchedBarrier implicit $r1 +... + + +# Test 6: test effects of long latency with no data dependency +# and no resource conflict. + +--- +name: noResConflictNoLatencyTwo +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: noResConflictNoLatencyTwo + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $p0, $r0, $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: JNZ $r0, %bb.1 + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 1, implicit-def $srcarry + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 2, implicit-def $srcarry + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 3, implicit-def $srcarry + ; CHECK-NEXT: $dj0 = MOV_mv_scl $r2 + ; CHECK-NEXT: $r1 = LDA_dms_lda_idx $p0, killed $dj0 + ; CHECK-NEXT: DelayedSchedBarrier + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 5, implicit-def $srcarry + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 7, implicit-def $srcarry + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 8, implicit-def $srcarry + ; CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 9, implicit-def $srcarry + ; 
CHECK-NEXT: $r2 = ADD_add_r_ri killed $r2, 10, implicit-def $srcarry + ; CHECK-NEXT: DelayedSchedBarrier implicit killed $r1 + + + bb.0: + successors: %bb.1 + bb.1: + successors: %bb.2, %bb.1 + liveins: $p0, $r0, $r1, $r2 + $r2 = ADD_add_r_ri $r2, 1, implicit-def $srcarry + $r2 = ADD_add_r_ri $r2, 2, implicit-def $srcarry + $r2 = ADD_add_r_ri $r2, 3, implicit-def $srcarry + $dj0 = MOV_mv_scl $r2 + $r1 = LDA_dms_lda_idx $p0, $dj0 + JNZ $r0, %bb.1 + DelayedSchedBarrier + bb.2: + successors: + liveins: $r1, $r2 + $r2 = ADD_add_r_ri $r2, 5, implicit-def $srcarry + $r2 = ADD_add_r_ri $r2, 7, implicit-def $srcarry + $r2 = ADD_add_r_ri $r2, 8, implicit-def $srcarry + $r2 = ADD_add_r_ri $r2, 9, implicit-def $srcarry + $r2 = ADD_add_r_ri $r2, 10, implicit-def $srcarry + RET implicit $lr + DelayedSchedBarrier implicit $r1 +... + +# Test 7: in case of an empty epilogue, this block must contain enough nops +# to clear the path for the correct execution of the epilogue's successor. + +--- +name: emptyEpilogue +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: emptyEpilogue + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $p0, $r0, $r1, $r2, $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: JNZ $r0, %bb.1 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $dj0 = MOV_mv_scl $r2 + ; CHECK-NEXT: $r1 = LDA_dms_lda_idx $p0, killed $dj0 + ; CHECK-NEXT: DelayedSchedBarrier + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: liveins: $r1, $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: liveins: 
$r1, $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $r2 = nsw ADD_add_r_ri $r1, -1, implicit-def $srcarry + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: liveins: $r1, $r2, $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier implicit killed $r3 + + + bb.0: + successors: %bb.1 + liveins: $p0 + bb.1: + successors: %bb.2, %bb.1 + liveins: $p0, $r0, $r1, $r2, $r3 + $dj0 = MOV_mv_scl $r2 + $r1 = LDA_dms_lda_idx $p0, $dj0 + JNZ $r0, %bb.1 + DelayedSchedBarrier + bb.2: + successors: %bb.3 + liveins: $r1, $r3 + bb.3: + successors: %bb.4 + liveins: $r1, $r3 + $r2 = nsw ADD_add_r_ri $r1, -1, implicit-def $srcarry + bb.4: + successors: + liveins: $r1, $r2, $r3 + RET implicit $lr + DelayedSchedBarrier implicit $r3 +... + +# Test 8: if we have just one instruction in the epilogue, we need to be +# sure that resource conflicts and latencies are resolved before the +# execution of the epilogue's successor. 
+ +--- +name: oneInstrEpilogue +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: oneInstrEpilogue + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $p0, $r0, $r1, $r2, $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: JNZ $r0, %bb.1 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $dj0 = MOV_mv_scl $r2 + ; CHECK-NEXT: $r1 = LDA_dms_lda_idx $p0, killed $dj0 + ; CHECK-NEXT: DelayedSchedBarrier + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: liveins: $r1, $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $r2 = nsw ADD_add_r_ri $r3, -1, implicit-def $srcarry + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: liveins: $r1, $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $r2 = nsw ADD_add_r_ri $r1, -1, implicit-def $srcarry + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: liveins: $r1, $r2, $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $r2 = nsw ADD_add_r_ri $r3, -1, implicit-def $srcarry + ; CHECK-NEXT: DelayedSchedBarrier implicit killed $r3 + + + bb.0: + successors: %bb.1 + liveins: $p0 + bb.1: + successors: %bb.2, %bb.1 + liveins: $p0, $r0, $r1, $r2, $r3 + $dj0 = MOV_mv_scl $r2 + $r1 = LDA_dms_lda_idx $p0, $dj0 + JNZ $r0, %bb.1 + DelayedSchedBarrier + bb.2: + successors: %bb.3 + liveins: $r1, $r3 + $r2 = nsw ADD_add_r_ri $r3, -1, implicit-def $srcarry + bb.3: + successors: %bb.4 + liveins: $r1, $r3 + $r2 = nsw ADD_add_r_ri $r1, -1, implicit-def $srcarry + bb.4: + successors: + liveins: $r1, $r2, $r3 + $r2 = nsw 
ADD_add_r_ri $r3, -1, implicit-def $srcarry + RET implicit $lr + DelayedSchedBarrier implicit $r3 +... + +# Test 9: the goal here is to test the resolution of the write port conflict. +# In this case, both LDA_dms_lda_idx and MUL_mul_r_rr can conflict with this +# resource, but as MUL uses this port in a stage later than ADD, for example, +# it is possible to issue the MUL earlier. + +--- +name: mulSchedEarly +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: mulSchedEarly + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $p0, $r0, $r1, $r2, $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: JNZ $r0, %bb.1 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $dj0 = MOV_mv_scl $r2 + ; CHECK-NEXT: $r1 = LDA_dms_lda_idx $p0, killed $dj0 + ; CHECK-NEXT: DelayedSchedBarrier + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: liveins: $r1, $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $r2 = MUL_mul_r_rr $r3, $r3 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: $r2 = nsw ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: liveins: $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier implicit killed $r3 + + + bb.0: + successors: %bb.1 + liveins: $p0 + bb.1: + successors: %bb.2, %bb.1 + liveins: $p0, $r0, $r1, $r2, $r3 + $dj0 = MOV_mv_scl $r2 + $r1 = LDA_dms_lda_idx $p0, $dj0 + JNZ $r0, %bb.1 + DelayedSchedBarrier + bb.2: + successors: %bb.3 + liveins: $r1, $r3 + $r2 = MUL_mul_r_rr $r3, $r3 + $r2 = nsw ADD_add_r_ri $r1, -1, 
implicit-def $srcarry + bb.3: + successors: + liveins: $r3 + RET implicit $lr + DelayedSchedBarrier implicit $r3 +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/neglat-shortloop.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/neglat-shortloop.mir index 6b12f5492119..20c32872863b 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/neglat-shortloop.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/neglat-shortloop.mir @@ -41,9 +41,6 @@ body: | ; CHECK-NEXT: bb.2: ; CHECK-NEXT: liveins: $r0, $r10 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: NOP - ; CHECK-NEXT: NOP - ; CHECK-NEXT: NOP ; CHECK-NEXT: RET implicit $lr ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP diff --git a/llvm/test/CodeGen/AIE/schedule/bundled-me-mutation.mir b/llvm/test/CodeGen/AIE/schedule/bundled-me-mutation.mir new file mode 100644 index 000000000000..d75eab40e931 --- /dev/null +++ b/llvm/test/CodeGen/AIE/schedule/bundled-me-mutation.mir @@ -0,0 +1,55 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py + +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + + +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2 --issue-limit=6 --run-pass=postmisched \ +# RUN: --aie-interblock-scoreboard=false %s -o - | FileCheck %s + +# Check that MemoryEdges mutation can process bundled instructions +# without misclassifying non-memory operations as memory operations. +# In this test, PADDB_ldb_ptr_inc_nrm_imm must not be considered a memory +# operation just because it is bundled with ST_dms_sts_idx_imm. 
+# In case of a misclassification, the compiler will crash. + +# Note: Comments sometimes refer to cycles. The first cycle is cycle 0 + +--- +name: testMemoryEdgeMutationWithBundle +alignment: 16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0, $p1, $r1 + ; CHECK-LABEL: name: testMemoryEdgeMutationWithBundle + ; CHECK: liveins: $p0, $p1, $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $r0 = LDA_dms_lda_idx_imm killed $p0, 0 + ; CHECK-NEXT: RET implicit $lr + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: ST_dms_sts_idx_imm $r0, killed $p1, 0 + ; CHECK-NEXT: $p0 = PADDB_ldb_ptr_inc_nrm_imm $p0, 4 { + ; CHECK-NEXT: $r1 = nsw ADD_add_r_ri killed $r1, -1, implicit-def $srcarry + ; CHECK-NEXT: } + ; CHECK-NEXT: DelayedSchedBarrier implicit killed $r0 + $r0 = LDA_dms_lda_idx_imm $p0, 0 + $r1 = nsw ADD_add_r_ri $r1, -1, implicit-def $srcarry + $p0 = PADD_imm_pseudo $p0, 4 { + ST_dms_sts_idx_imm $r0, $p1, 0 + } + RET implicit $lr + DelayedSchedBarrier implicit $r0 +... diff --git a/llvm/test/CodeGen/AIE/schedule/bundled-ree-mutation.mir b/llvm/test/CodeGen/AIE/schedule/bundled-ree-mutation.mir new file mode 100644 index 000000000000..07bbbaabf292 --- /dev/null +++ b/llvm/test/CodeGen/AIE/schedule/bundled-ree-mutation.mir @@ -0,0 +1,75 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py + +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + + +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2 --issue-limit=6 --run-pass=postmisched \ +# RUN: --aie-interblock-scoreboard=false %s -o - | FileCheck %s + +# Check that RegionEndEdges mutation can process bundled instructions +# without misclassifying non-control-flow operations as control-flow ones +# (with delay slots). When we misclassify, we can end up with instructions +# with EdgeLatency > DelaySlots, which is invalid. This test also must +# not cause a crash. In this specific case, LDA_dms_lda_idx_imm must not be +# considered to have delay slots just because it is bundled with JNZ. + +--- +name: testRegionEdgeEndMutationWithBundle +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: testRegionEdgeEndMutationWithBundle + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.0(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $p0, $p1, $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $r0 = LDA_dms_lda_idx_imm $p0, 0 { + ; CHECK-NEXT: JNZ $r1, %bb.0 + ; CHECK-NEXT: } + ; CHECK-NEXT: $r0 = LDA_dms_lda_idx_imm $p0, 0 + ; CHECK-NEXT: $r0 = LDA_dms_lda_idx_imm $p0, 0 + ; CHECK-NEXT: $r0 = LDA_dms_lda_idx_imm $p0, 0 + ; CHECK-NEXT: $r0 = LDA_dms_lda_idx_imm $p0, 0 + ; CHECK-NEXT: $r0 = LDA_dms_lda_idx_imm $p0, 0 + ; CHECK-NEXT: DelayedSchedBarrier + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $p0, $p1, $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: DelayedSchedBarrier + bb.0: + liveins: $p0, $p1, $r1 + $r0 = LDA_dms_lda_idx_imm $p0, 0 + $r0 = LDA_dms_lda_idx_imm $p0, 0 + $r0 = LDA_dms_lda_idx_imm $p0, 0 + $r0 = LDA_dms_lda_idx_imm $p0, 0 + $r0 = LDA_dms_lda_idx_imm $p0, 0 + $r0 = LDA_dms_lda_idx_imm $p0, 0 + JNZ $r1, %bb.0 + DelayedSchedBarrier + bb.1: + liveins: $p0, $p1, $r1 + NOP + NOP + NOP + NOP + NOP + NOP + DelayedSchedBarrier +...