[AIEX] Create SUs for top-fixed instructions
Also add all related dependencies to have safety margins.
andcarminati committed Dec 10, 2024
1 parent c1dc7df commit 04a2805
Showing 18 changed files with 417 additions and 54 deletions.
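At its core, the patch turns the region's top-fixed instructions into scheduling units chained from EntrySU through artificial, latency-carrying edges, so that each one's depth, and hence its emission cycle, is fully determined. The standalone sketch below is a hypothetical illustration of that idea and is not part of the commit: depth is the maximum over predecessors of predecessor depth plus edge latency, which is exactly what pins the fixed units in place.

// Hypothetical sketch (not part of the commit): chaining nodes with
// artificial latency edges pins each node's depth, i.e. the earliest cycle
// the scheduler can place it in.
#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

struct Node {
  const char *Name;
  std::vector<std::pair<Node *, int>> Preds; // (predecessor, edge latency)
  int depth() const {
    int D = 0;
    for (const auto &[Pred, Lat] : Preds)
      D = std::max(D, Pred->depth() + Lat);
    return D;
  }
};

int main() {
  Node Entry{"EntrySU", {}};
  // The first fixed instruction hangs off Entry with latency 0, every
  // following one off its predecessor with latency 1, mirroring the
  // EmitFixedSUnits mutation in the diff below.
  Node F0{"Fixed0", {{&Entry, 0}}};
  Node F1{"Fixed1", {{&F0, 1}}};
  Node F2{"Fixed2", {{&F1, 1}}};
  for (const Node *N : {&F0, &F1, &F2})
    std::printf("%s is pinned to cycle %d\n", N->Name, N->depth());
  return 0;
}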
142 changes: 142 additions & 0 deletions llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
@@ -20,6 +20,7 @@
#include "AIEMachineScheduler.h"
#include "AIEMaxLatencyFinder.h"
#include "AIESubtarget.h"
#include "Utils/AIELoopUtils.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ScheduleDAG.h"
@@ -314,20 +315,102 @@ class RegionEndEdges : public ScheduleDAGMutation {
/// "fixed" SUnits.
class EmitFixedSUnits : public ScheduleDAGMutation {
public:
  struct RegAvailabilityTracker {
    using fixed_iterator = MachineBasicBlock::iterator;
    const InstrItineraryData *InstrItins;
    const TargetRegisterInfo *TRI;
    std::map<MCRegister, unsigned> RegisterToCycle;

    RegAvailabilityTracker(const InstrItineraryData *Itins,
                           const TargetRegisterInfo *TRI)
        : InstrItins(Itins), TRI(TRI) {}

    void updateRegisterAvailability(Register Reg, unsigned Latency) {
      for (MCRegAliasIterator Ali(Reg, TRI, true); Ali.isValid(); ++Ali) {
        auto RegCycle = RegisterToCycle.find(*Ali);
        if (RegCycle == RegisterToCycle.end()) {
          RegisterToCycle[*Ali] = Latency;
        } else {
          RegisterToCycle[*Ali] = std::max(RegCycle->second, Latency);
        }
      }
    }

    // Calculate the cycle in which each register is last materialized, given
    // a sequence of timed bundles. PastTheEndCycles projects the availability
    // cycle into the subsequent region.
    void computeAvailabilityCycles(ArrayRef<AIE::MachineBundle> Bundles,
                                   bool PastTheEndCycles) {
      int Cycle = 0;
      const int TotalCycles = Bundles.size();
      for (const auto &Bundle : Bundles) {
        for (MachineInstr *SrcMI : Bundle.getInstrs()) {
          for (unsigned OpNum = 0; OpNum < SrcMI->getNumOperands(); OpNum++) {
            unsigned SrcClass = SrcMI->getDesc().getSchedClass();
            const MachineOperand &MO = SrcMI->getOperand(OpNum);
            if (!MO.isReg() || !MO.isDef())
              continue;
            std::optional<unsigned> OptSrcCycle =
                InstrItins->getOperandCycle(SrcClass, OpNum);
            assert(OptSrcCycle);
            int Latency = *OptSrcCycle;
            int MaterializationCycle = 0;
            if (PastTheEndCycles) {
              int RemainingCycles = TotalCycles - Cycle - 1;
              MaterializationCycle = std::max(Latency - RemainingCycles - 1, 0);
            } else {
              MaterializationCycle = Cycle + Latency;
            }
            updateRegisterAvailability(MO.getReg(), MaterializationCycle);
          }
        }
        Cycle++;
      }
    }

    unsigned getMaxSrcOperandLatency(const MachineInstr &MI) const {
      unsigned MaxLatency = 0;
      for (const MachineOperand &MO : MI.all_uses()) {
        if (!MO.isReg() || !MO.isUse())
          continue;
        for (MCRegAliasIterator Ali(MO.getReg(), TRI, true); Ali.isValid();
             ++Ali) {
          auto RegCycle = RegisterToCycle.find(*Ali);
          if (RegCycle != RegisterToCycle.end()) {
            MaxLatency = std::max(RegCycle->second, MaxLatency);
          }
        }
      }
      return MaxLatency;
    }
  };

  void apply(ScheduleDAGInstrs *DAG) override {
    AIEPostRASchedStrategy *Scheduler =
        static_cast<AIEScheduleDAGMI *>(DAG)->getSchedImpl();
    auto *TII = static_cast<const AIEBaseInstrInfo *>(DAG->TII);
    auto *ItinData = DAG->MF.getSubtarget().getInstrItineraryData();
    const TargetRegisterInfo *TRI = DAG->MF.getSubtarget().getRegisterInfo();
    const BlockState &BS =
        Scheduler->getInterBlock().getBlockState(DAG->getBB());
    const Region &CurRegion = BS.getCurrentRegion();
    RegAvailabilityTracker RAT{ItinData, TRI};

    // First, create SUnits for all "fixed" instructions.
    // Those will be chained from/to the EntrySU/ExitSU to ensure they are
    // placed in the correct cycle. The scheduler will enforce that these fixed
    // SUnits get placed exactly at their depth (for the Top zone) or height
    // (for the Bot zone).
    SUnit *Pred = &DAG->EntrySU;
    for (MachineInstr &MI : CurRegion.top_fixed_instrs()) {
      SUnit &FixedSU = Scheduler->addFixedSUnit(MI, /*IsTop=*/true);
      SDep Dep(Pred, SDep::Artificial);
      Dep.setLatency(Pred == &DAG->EntrySU ? 0 : 1);
      FixedSU.addPred(Dep);
      Pred = &FixedSU;
    }

    SUnit *Succ = &DAG->ExitSU;
    for (MachineInstr &MI : reverse(CurRegion.bot_fixed_instrs())) {
      SUnit &FixedSU = Scheduler->addFixedSUnit(MI, /*IsTop=*/false);
@@ -359,6 +442,65 @@ class EmitFixedSUnits : public ScheduleDAGMutation {
          AIE::maxLatency(&MI, *TII, *ItinData, /*IncludeStages=*/true));
      FixedDepSU->addPred(Dep, /*Required=*/true);
    }

    // We only need to focus on top-fixed instructions when there is an
    // Epilogue block.
    if (BS.Kind != BlockType::Epilogue)
      return;

    MachineBasicBlock *Loop = AIELoopUtils::getLoopPredecessor(*DAG->getBB());
    assert(Loop);
    const BlockState &LBS = Scheduler->getInterBlock().getBlockState(Loop);
    assert(LBS.Kind == BlockType::Loop);

    if (!LBS.isPipelined())
      return;

    ArrayRef<AIE::MachineBundle> LoopTimedBundles = LBS.getTop().Bundles;
    ArrayRef<AIE::MachineBundle> TopFixedBundles =
        CurRegion.getTopFixedBundles();

    RAT.computeAvailabilityCycles(TopFixedBundles, /*PastTheEndCycles=*/false);
    // Reusing the RAT to establish the individual safety margins between the
    // pipelined loop and the free instructions is more cost-effective: all
    // dependencies related to EntrySU are managed in one centralized location.
    // While it is possible to implement this as a separate mutator, doing so
    // would be costly, because it would prevent the creation of multiple edges
    // from EntrySU to a free instruction that depends on both timed regions
    // (TopFixed and LoopTimed).
    RAT.computeAvailabilityCycles(LoopTimedBundles, /*PastTheEndCycles=*/true);

    auto IsNotTopFixedSU = [Scheduler](const SUnit &SU) {
      return !Scheduler->isFixedSU(SU, true);
    };

    // Establish dependencies for each non-top-fixed scheduling unit, taking
    // into account the availability cycle of each of its operands.
    for (SUnit &SU : make_filter_range(DAG->SUnits, IsNotTopFixedSU)) {
      const MachineInstr &MI = *SU.getInstr();
      if (const unsigned Latency = RAT.getMaxSrcOperandLatency(MI)) {
        SDep Dep(&DAG->EntrySU, SDep::Artificial);
        Dep.setLatency(Latency);
        SU.addPred(Dep, /*Required=*/true);
      }
    }

    auto IsTopFixedSU = [Scheduler](const SUnit &SU) {
      return Scheduler->isFixedSU(SU, true);
    };

    // Establish dependencies to ExitSU for each top-fixed scheduling unit,
    // taking into account its maximum latency.
    for (SUnit &FixedSU : make_filter_range(DAG->SUnits, IsTopFixedSU)) {
      const MachineInstr &MI = *FixedSU.getInstr();
      if (const unsigned Latency = RAT.getMaxSrcOperandLatency(MI)) {
        SDep Dep(&FixedSU, SDep::Artificial);
        int EdgeLatency =
            AIE::maxLatency(&MI, *TII, *ItinData, /*IncludeStages=*/true);
        Dep.setLatency(EdgeLatency);
        DAG->ExitSU.addPred(Dep, /*Required=*/true);
      }
    }
  }
};

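For intuition on the RegAvailabilityTracker added above, here is a hypothetical, self-contained sketch (not part of the commit; registers are plain integers and latencies are made up). It maps every register defined in a timed bundle to the last cycle in which it materializes; with PastTheEndCycles the cycle is projected past the end of the region, which is the value used as a safety margin at the start of the next region.

// Hypothetical sketch (not part of the commit): toy availability tracking
// over a sequence of timed bundles, one bundle per cycle.
#include <algorithm>
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

using Def = std::pair<unsigned, int>; // (register, operand latency)
using Bundle = std::vector<Def>;      // defs issued in one cycle

// Record, for every register, the last cycle in which it becomes available.
static void computeAvailability(const std::vector<Bundle> &Bundles,
                                bool PastTheEndCycles,
                                std::map<unsigned, int> &RegToCycle) {
  const int TotalCycles = static_cast<int>(Bundles.size());
  for (int Cycle = 0; Cycle < TotalCycles; ++Cycle) {
    for (const Def &D : Bundles[Cycle]) {
      int Materialization;
      if (PastTheEndCycles) {
        // Project into the next region: only the part of the latency that is
        // not already covered by the remaining cycles of this region.
        int Remaining = TotalCycles - Cycle - 1;
        Materialization = std::max(D.second - Remaining - 1, 0);
      } else {
        Materialization = Cycle + D.second;
      }
      auto It = RegToCycle.find(D.first);
      if (It == RegToCycle.end())
        RegToCycle.emplace(D.first, Materialization);
      else
        It->second = std::max(It->second, Materialization);
    }
  }
}

int main() {
  // Two-cycle region: register 1 (latency 1) is defined in cycle 0, register
  // 5 (latency 3) in the last cycle, so two cycles of margin remain for the
  // next region before register 5 can be read.
  std::vector<Bundle> Region = {{{1, 1}}, {{5, 3}}};
  std::map<unsigned, int> RegToCycle;
  computeAvailability(Region, /*PastTheEndCycles=*/true, RegToCycle);
  for (const auto &[Reg, Cycle] : RegToCycle)
    std::printf("r%u: %d cycle(s) of margin in the next region\n", Reg, Cycle);
  return 0;
}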
18 changes: 2 additions & 16 deletions llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
@@ -204,20 +204,6 @@ MachineInstr *checkResourceConflictsTopDown(
  return ConflictMI;
}

MachineBasicBlock *getLoopPredecessor(const MachineBasicBlock &MBB) {
  if (MBB.pred_size() == 1) {
    // if we have only one, it must be the loop
    return *MBB.predecessors().begin();
  }
  // Otherwise, the loop is the fallthrough predecessor by construction
  for (auto *Pred : MBB.predecessors()) {
    if (Pred->isLayoutSuccessor(&MBB)) {
      return Pred;
    }
  }
  return nullptr;
}

InterBlockScheduling::InterBlockScheduling(const MachineSchedContext *C,
                                           bool InterBlock)
    : Context(C), InterBlockScoreboard(InterBlock) {}
@@ -809,7 +795,7 @@ void InterBlockScheduling::emitInterBlockSafetyMargin(
  }

  MachineBasicBlock *BB = BS.TheBlock;
  MachineBasicBlock *Loop = getLoopPredecessor(*BB);
  MachineBasicBlock *Loop = AIELoopUtils::getLoopPredecessor(*BB);
  assert(Loop);
  const BlockState &LBS = getBlockState(Loop);

@@ -833,7 +819,7 @@ void InterBlockScheduling::emitInterBlockTop(BlockState &BS) {
  }

  MachineBasicBlock *BB = BS.TheBlock;
  MachineBasicBlock *Loop = getLoopPredecessor(*BB);
  MachineBasicBlock *Loop = AIELoopUtils::getLoopPredecessor(*BB);
  assert(Loop);
  const BlockState &LBS = getBlockState(Loop);

15 changes: 15 additions & 0 deletions llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp
@@ -30,6 +30,21 @@
// operand and the memory latency. Include the stage latency if requested.
int maxLatency(const MachineInstr *MI, const AIEBaseInstrInfo &InstrInfo,
               const InstrItineraryData &Itineraries, bool IncludeStages) {

  // If we have a Bundle, query maxLatency for each bundled instruction.
  if (MI->getOpcode() == TargetOpcode::BUNDLE) {
    int BundleLatency = 0;
    MachineBasicBlock::const_instr_iterator I = MI->getIterator();
    MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
    while (++I != E && I->isBundledWithPred()) {
      assert(!I->isBundle() && "No nested bundle!");
      BundleLatency =
          std::max(maxLatency(&(*I), InstrInfo, Itineraries, IncludeStages),
                   BundleLatency);
    }
    return BundleLatency;
  }

  int Latency = 0;
  unsigned SrcClass = MI->getDesc().getSchedClass();
  for (unsigned I = 0;; I++) {
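The new bundle case in maxLatency above is a plain max-reduction over the bundled instructions. A hypothetical sketch with made-up per-instruction latencies (not the real AIE itineraries or LLVM API):

// Hypothetical sketch (not the actual LLVM API): the max latency of a bundle
// is the maximum of the latencies of its bundled instructions.
#include <algorithm>
#include <cstdio>
#include <vector>

struct FakeInstr {
  const char *Name;
  int MaxLatency; // stand-in for AIE::maxLatency() of a single instruction
};

static int bundleMaxLatency(const std::vector<FakeInstr> &Bundle) {
  int Latency = 0;
  for (const FakeInstr &I : Bundle)
    Latency = std::max(Latency, I.MaxLatency);
  return Latency;
}

int main() {
  // Made-up latencies: a load-like instruction dominates the bundle.
  std::vector<FakeInstr> Bundle = {{"VLDB", 7}, {"ADD", 1}, {"NOP", 1}};
  std::printf("bundle max latency = %d\n", bundleMaxLatency(Bundle));
  return 0;
}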
14 changes: 14 additions & 0 deletions llvm/lib/Target/AIE/Utils/AIELoopUtils.cpp
@@ -116,4 +116,18 @@ bool isSingleMBBLoop(const MachineBasicBlock *MBB) {
  return NumLoopEdges == 1 && NumExitEdges == 1;
}

MachineBasicBlock *getLoopPredecessor(const MachineBasicBlock &MBB) {
  if (MBB.pred_size() == 1) {
    // if we have only one, it must be the loop
    return *MBB.predecessors().begin();
  }
  // Otherwise, the loop is the fallthrough predecessor by construction
  for (auto *Pred : MBB.predecessors()) {
    if (Pred->isLayoutSuccessor(&MBB)) {
      return Pred;
    }
  }
  return nullptr;
}

} // namespace llvm::AIELoopUtils
4 changes: 4 additions & 0 deletions llvm/lib/Target/AIE/Utils/AIELoopUtils.h
@@ -48,6 +48,10 @@
/// Check if this block is a single block loop.
bool isSingleMBBLoop(const MachineBasicBlock *MBB);

/// Return the loop predecessor of MBB, i.e. its single predecessor or, if
/// there are several, the fall-through (layout) predecessor; nullptr if none.
MachineBasicBlock *getLoopPredecessor(const MachineBasicBlock &MBB);

} // namespace llvm::AIELoopUtils

#endif
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll
@@ -8,8 +8,8 @@
; RUN: llc -O2 -mtriple=aie2 --enable-pipeliner=1 --enable-aie-hardware-loops=false \
; RUN: --enable-aie-zero-overhead-loops=false %s -o - | FileCheck %s --check-prefix=DCL
; RUN: llc -O2 -mtriple=aie2 --enable-pipeliner=1 %s -o - | FileCheck %s --check-prefix=ZOL
; RUN: llc -O2 -mtriple=aie2 --enable-pipeliner=0 %s -o - --debug-only=machine-scheduler \
; RUN: 2>&1 | %imisched -d - \
; RUN: llc -O2 -mtriple=aie2 --enable-pipeliner=0 --aie-postpipeliner-maxii=0 %s -o \
; RUN: - --debug-only=machine-scheduler 2>&1 | %imisched -d - \
; RUN: | FileCheck %s --check-prefix=SCHED-DUMP

; Variation of the already existing test, but enabling SWP under different flavors:
@@ -39,7 +39,8 @@
; CHECK-NEXT: .L_LEnd0:
; CHECK-NEXT: nopb ; nopa ; st r1, [p0], #4; add r1, r1, #1; nopm ; nopv
; CHECK-NEXT: // %bb.3: // %for.cond.cleanup
; CHECK-NEXT: nopb ; nopa ; st r1, [p0], #4; nopxm ; nopv
; CHECK-NEXT: nopa ; nopb ; nopxm ; st r1, [p0], #4
; CHECK-NEXT: nop
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup
; CHECK-NEXT: nopa ; ret lr
38 changes: 16 additions & 22 deletions llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir
@@ -118,44 +118,38 @@
; CHECK-NEXT: vldb wh7, [p1], #32; nopa ; nops ; nopx ; mov r6, p0; vmac cm3, cm3, x3, x10, r4
; CHECK-NEXT: // %bb.3: // %outer.loop.latch
; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: nopa ; and r5, r6, r9; vshift.align x4, x4, s1, x6, r0; vmac cm4, cm4, x5, x10, r4
; CHECK-NEXT: vmac cm5, cm5, x9, x7, r4
; CHECK-NEXT: add r0, r5, #33; vshift.align x2, x2, s1, x8, r0; vmac cm6, cm6, x1, x7, r4
; CHECK-NEXT: nopb ; nopa ; nops ; and r5, r6, r9; vshift.align x4, x4, s1, x6, r0; vmac cm4, cm4, x5, x10, r4
; CHECK-NEXT: nopb ; nopa ; nops ; add r7, r7, #-1; mov s3, r6; vmac cm5, cm5, x9, x7, r4
; CHECK-NEXT: nopa ; add r0, r5, #33; vshift.align x2, x2, s1, x8, r0; vmac cm6, cm6, x1, x7, r4
; CHECK-NEXT: vshuffle x9, x4, x2, r2; vmac cm7, cm7, x3, x7, r4
; CHECK-NEXT: vshuffle x1, x9, x0, r8; vmac cm0, cm0, x5, x7, r4
; CHECK-NEXT: vshuffle x3, x4, x2, r3; vmac cm1, cm1, x9, x10, r4
; CHECK-NEXT: vshuffle x5, x3, x0, r8; vmac cm2, cm2, x1, x10, r4
; CHECK-NEXT: vmac cm3, cm3, x3, x10, r4
; CHECK-NEXT: vmac cm4, cm4, x5, x10, r4
; CHECK-NEXT: vmac cm5, cm5, x9, x7, r4
; CHECK-NEXT: vmac cm6, cm6, x1, x7, r4
; CHECK-NEXT: vmac cm7, cm7, x3, x7, r4
; CHECK-NEXT: vmac cm0, cm0, x5, x7, r4
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: vst.srs.s16.s32 bmh1, s2, [p3, #32]; mov s3, r6
; CHECK-NEXT: vst.srs.s16.s32 bml1, s3, [p3], #64
; CHECK-NEXT: vst.srs.s16.s32 bmh2, s3, [p3, #32]
; CHECK-NEXT: mov dc5, r26; vmac cm3, cm3, x3, x10, r4
; CHECK-NEXT: mov dn5, r27; vmac cm4, cm4, x5, x10, r4
; CHECK-NEXT: mov dj5, r28; vmac cm5, cm5, x9, x7, r4
; CHECK-NEXT: vst.srs.s16.s32 bmh1, s2, [p3, #32]; mov m1, r10; vmac cm6, cm6, x1, x7, r4
; CHECK-NEXT: vst.srs.s16.s32 bml1, s3, [p3], #64; mov m2, r13; vmac cm7, cm7, x3, x7, r4
; CHECK-NEXT: padda.3d [p1], d2; vst.srs.s16.s32 bmh2, s3, [p3, #32]; mov m3, r14; vmac cm0, cm0, x5, x7, r4
; CHECK-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m4
; CHECK-NEXT: vst.srs.s16.s32 bmh3, s3, [p3, #32]
; CHECK-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64
; CHECK-NEXT: vst.srs.s16.s32 bmh4, s3, [p3, #32]
; CHECK-NEXT: vst.srs.s16.s32 bml4, s3, [p3], m7
; CHECK-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32]
; CHECK-NEXT: vst.srs.s16.s32 bml5, s3, [p3], #64
; CHECK-NEXT: vst.srs.s16.s32 bmh6, s3, [p3, #32]; mov dc5, r26
; CHECK-NEXT: vst.srs.s16.s32 bml6, s3, [p3], m4; mov dn5, r27
; CHECK-NEXT: vst.srs.s16.s32 bmh7, s3, [p3, #32]; mov dj5, r28
; CHECK-NEXT: vst.srs.s16.s32 bml7, s3, [p3], #64; mov m1, r10
; CHECK-NEXT: vst.srs.s16.s32 bmh0, s3, [p3, #32]; mov m2, r13
; CHECK-NEXT: vst.srs.s16.s32 bmh6, s3, [p3, #32]
; CHECK-NEXT: vst.srs.s16.s32 bml6, s3, [p3], m4
; CHECK-NEXT: vst.srs.s16.s32 bmh7, s3, [p3, #32]
; CHECK-NEXT: vst.srs.s16.s32 bml7, s3, [p3], #64
; CHECK-NEXT: vst.srs.s16.s32 bmh0, s3, [p3, #32]
; CHECK-NEXT: vst.2d.srs.s16.s32 bml0, s3, [p3], d5; mov dj5, r11
; CHECK-NEXT: add r7, r7, #-1; mov dn5, r12
; CHECK-NEXT: mov dn5, r12
; CHECK-NEXT: jnz r7, #.LBB0_1
; CHECK-NEXT: mov r26, dc5 // Delay Slot 5
; CHECK-NEXT: mov dc5, r25 // Delay Slot 4
; CHECK-NEXT: padda.3d [p0], d1; mov m1, r24 // Delay Slot 3
; CHECK-NEXT: padda.3d [p1], d2; paddb [p2], m1; mov m3, r14 // Delay Slot 2
; CHECK-NEXT: paddb [p2], m1 // Delay Slot 2
; CHECK-NEXT: padda.3d [p2], d3; mov r25, dc5 // Delay Slot 1
; CHECK-NEXT: // %bb.4: // %exitStub
; CHECK-NEXT: lda p7, [sp, #-32]; nopb ; nopxm // 4-byte Folded Reload