Xilinx · andcarminati · Jun 11, 2024 · May 29, 2024 · May 29, 2024 · Jun 4, 2024
@@ -53,8 +53,10 @@ namespace {
 const constexpr unsigned NumDelaySlots = 5;
 } // namespace
 
-unsigned AIEBaseInstrInfo::getNumDelaySlots(const MachineInstr &MI) const {
-  return MI.hasDelaySlot() ? NumDelaySlots : 0;
+unsigned
+AIEBaseInstrInfo::getNumDelaySlots(const MachineInstr &MI,
+                                   MachineInstr::QueryType Query) const {
+  return MI.hasDelaySlot(Query) ? NumDelaySlots : 0;
 }
 
 unsigned

@@ -77,7 +77,10 @@ struct AIEBaseInstrInfo : public TargetInstrInfo {
 
   /// Returns the number of delay slots that this instruction requires.
   /// This might be 0
-  virtual unsigned getNumDelaySlots(const MachineInstr &MI) const;
+  virtual unsigned
+  getNumDelaySlots(const MachineInstr &MI,
+                   MachineInstr::QueryType Query =
+                       MachineInstr::QueryType::AnyInBundle) const;
 
   /// Returns the number of delay slots that should be reserved, i.e.
   /// not filled in by the scheduler.

@@ -210,7 +210,7 @@ class RegionEndEdges : public ScheduleDAGMutation {
     }
   }
   void apply(ScheduleDAGInstrs *DAG) override {
-    AIE::MaxLatencyFinder MaxLatency(static_cast<AIEScheduleDAGMI *>(DAG));
+    AIE::MaxLatencyFinder MaxLatency(DAG);
 
     // Default edges to ExitSU are conservative, and can't be shrunk.
     // We really should know what we're doing here, so just remove and
@@ -224,7 +224,12 @@ class RegionEndEdges : public ScheduleDAGMutation {
 
       SDep ExitDep(&SU, SDep::Artificial);
 
-      unsigned DelaySlots = TII->getNumDelaySlots(MI);
+      // By using IgnoreBundle, we can safely apply this mutation to already
+      // bundled instructions without causing misclassification of instructions
+      // that are bundled with control flow ones. Otherwise, the assertion
+      // below can be triggered for correct cases.
+      unsigned DelaySlots =
+          TII->getNumDelaySlots(MI, MachineInstr::QueryType::IgnoreBundle);
       unsigned EdgeLatency = !DelaySlots && UserSetLatencyMargin
                                  ? UserLatencyMargin
                                  : MaxLatency(MI);
@@ -355,12 +360,15 @@ class PropagateIncomingLatencies : public ScheduleDAGMutation {
 class MemoryEdges : public ScheduleDAGMutation {
   void apply(ScheduleDAGInstrs *DAG) override {
     const auto *TII = static_cast<const AIEBaseInstrInfo *>(DAG->TII);
-
+    // Query individual instruction behavior. This is because we might create
+    // dependencies with already-scheduled blocks where Bundles have been
+    // created.
+    const auto QueryType = MachineInstr::QueryType::IgnoreBundle;
     // Run over all instructions that may load or store, and correct the
     // latencies for all their memory dependencies.
     for (SUnit &SU : DAG->SUnits) {
       MachineInstr &MI = *SU.getInstr();
-      if (!MI.mayLoadOrStore()) {
+      if (!MI.mayLoadOrStore(QueryType)) {
         continue;
       }
 
@@ -369,13 +377,14 @@ class MemoryEdges : public ScheduleDAGMutation {
 
         // Ignore non-memory dependencies. Locks or other instructions with side
         // effects aren't handled with MemInstrItinData itineraries.
-        if (!PredEdge.isNormalMemoryOrBarrier() || !SrcMI.mayLoadOrStore()) {
+        if (!PredEdge.isNormalMemoryOrBarrier() ||
+            !SrcMI.mayLoadOrStore(QueryType)) {
           continue;
         }
 
         // Ignore Load-Load (RAR) dependencies.
         // TODO: Those should probably be removed altogether.
-        if (!SrcMI.mayStore() && !MI.mayStore()) {
+        if (!SrcMI.mayStore(QueryType) && !MI.mayStore(QueryType)) {
           continue;
         }
 

@@ -32,8 +32,58 @@ static cl::opt<bool>
     LoopAware("aie-loop-aware", cl::init(true),
               cl::desc("[AIE] Schedule single block loops iteratively"));
 
+// When set (the default), getNumEntryNops() derives the entry nops of a
+// non-empty epilogue from the loop/epilogue latency and resource analysis
+// instead of falling back to the loop's safety margin.
+static cl::opt<bool> LoopEpilogueAnalysis(
+    "aie-loop-epilogue-analysis", cl::init(true),
+    cl::desc("[AIE] Perform Loop/Epilogue analysis with loop scheduling"));
+
 namespace llvm::AIE {
 
+/// Debug helper: write the node number and the instruction of every SUnit
+/// in \p Edges to dbgs().
+void dumpInterBlock(const InterBlockEdges &Edges) {
+  auto &OS = dbgs();
+  for (const SUnit &Node : Edges) {
+    const MachineInstr &Instr = *Node.getInstr();
+    OS << "SU" << Node.NodeNum << ": " << Instr;
+  }
+}
+
+/// Replay the trailing bundles of \p Bundles into \p Scoreboard. Every
+/// instruction of a bundle is emitted at cycle 0, and the scoreboard is
+/// advanced once after each bundle.
+void emitBundlesInScoreboard(const std::vector<MachineBundle> &Bundles,
+                             ResourceScoreboard<FuncUnitWrapper> &Scoreboard,
+                             AIEHazardRecognizer *HR) {
+
+  // Do not emit more bundles than the conflict horizon specifies; bundles
+  // further away than this will not cause conflicts.
+  const int NumBundles = Bundles.size();
+  const int NumToReplay = std::min(NumBundles, HR->getConflictHorizon());
+
+  int Idx = NumBundles - NumToReplay;
+  while (Idx < NumBundles) {
+    const MachineBundle &Bundle = Bundles[Idx];
+    for (MachineInstr *Instr : Bundle.getInstrs())
+      HR->emitInScoreboard(Scoreboard, Instr->getDesc(), 0);
+
+    Scoreboard.advance();
+    ++Idx;
+  }
+}
+
+/// Emit the leading bundles of \p Bundles into \p Scoreboard, each at the
+/// cycle offset given by the current value of \p Delta. \p Delta is
+/// incremented after every emitted bundle and is updated in place for the
+/// caller; emission stops as soon as \p Delta is no longer negative or the
+/// bundles run out.
+void emitBundlesInScoreboardDelta(
+    const std::vector<MachineBundle> &Bundles,
+    ResourceScoreboard<FuncUnitWrapper> &Scoreboard, int &Delta,
+    AIEHazardRecognizer *HR) {
+
+  // Once Delta reaches zero the scoreboard is exhausted, so no further
+  // instructions need to be replayed.
+  for (auto BundleIt = Bundles.begin();
+       BundleIt != Bundles.end() && Delta < 0; ++BundleIt, ++Delta) {
+    for (MachineInstr *Instr : BundleIt->getInstrs())
+      HR->emitInScoreboard(Scoreboard, Instr->getDesc(), Delta);
+  }
+}
+
+/// Return the first entry of \p MBB's predecessor list. Asserts that the
+/// list holds exactly one block.
+MachineBasicBlock *getSinglePredecessor(const MachineBasicBlock &MBB) {
+  assert(MBB.pred_size() == 1 && "MBB contains more than 1 predecessor");
+  return *MBB.predecessors().begin();
+}
+
 InterBlockScheduling::InterBlockScheduling(const MachineSchedContext *C,
                                            bool InterBlock)
     : Context(C), InterBlockScoreboard(InterBlock) {}
@@ -118,12 +168,8 @@ bool InterBlockScheduling::resourcesConverged(BlockState &BS) const {
   ResourceScoreboard<FuncUnitWrapper> Bottom;
   Bottom.reset(Depth);
 
-  for (auto &Bundle : BS.getBottom().Bundles) {
-    for (MachineInstr *MI : Bundle.getInstrs()) {
-      HR->emitInScoreboard(Bottom, MI->getDesc(), 0);
-    }
-    Bottom.advance();
-  }
+  emitBundlesInScoreboard(BS.getBottom().Bundles, Bottom, HR.get());
+
   DEBUG_LOOPAWARE(dbgs() << "Bottom scoreboard\n"; Bottom.dump());
   // We have two successors, the loop itself and the epilogue
   assert(BS.TheBlock->succ_size() == 2);
@@ -138,15 +184,9 @@ bool InterBlockScheduling::resourcesConverged(BlockState &BS) const {
     ResourceScoreboard<FuncUnitWrapper> Top;
     Top.reset(Depth);
     int Cycle = -Depth;
-    for (auto &Bundle : BS.getBottom().Bundles) {
-      if (Cycle >= 0) {
-        break;
-      }
-      for (MachineInstr *MI : Bundle.getInstrs()) {
-        HR->emitInScoreboard(Top, MI->getDesc(), Cycle);
-      }
-      Cycle++;
-    }
+
+    emitBundlesInScoreboardDelta(BS.getBottom().Bundles, Top, Cycle, HR.get());
+
     DEBUG_LOOPAWARE(dbgs() << "Top scoreboard\n"; Top.dump());
     if (Bottom.conflict(Top, Depth)) {
       return false;
@@ -369,14 +409,152 @@ int InterBlockScheduling::getNumEntryNops(const BlockState &BS) const {
   }
   const MachineBasicBlock &BB = *BS.TheBlock;
   assert(BB.pred_size() == 1);
-  MachineBasicBlock *Loop = *BB.predecessors().begin();
+  MachineBasicBlock *Loop = getSinglePredecessor(BB);
   auto &LBS = getBlockState(Loop);
 
-  // TODO: we can do better by doing full interblock analysis
-  // between BS and LBS
+  // We can only analyze non-empty epilogue blocks because the analysis
+  // needs to build a DDG, which is not possible for an empty block.
+  // For empty ones, we need to be conservative because we are not aware of
+  // the content of the epilogue's successors.
+  if (LoopEpilogueAnalysis && BB.size() > 0) {
+    int ExistingLatency = getCyclesToRespectTiming(BS, LBS);
+    // Start the next step only after clearing latencies.
+    return getCyclesToAvoidResourceConflicts(ExistingLatency, BS, LBS);
+  }
+
   return LBS.getSafetyMargin();
 }
 
+/// Compute how many cycles must be inserted at the entry of the epilogue so
+/// that the latencies of dependences carried from the loop are respected.
+///
+/// The bottom region of \p LoopBS and the top region of \p EpilogueBS are
+/// laid out back to back, one bundle per cycle, and dependence edges are
+/// built across the boundary between them. For every edge that crosses the
+/// boundary, the shortfall (edge latency minus the distance in cycles
+/// between both instructions) is computed.
+///
+/// \returns the largest shortfall found, or 0 when every latency is already
+/// covered by the existing distance.
+int InterBlockScheduling::getCyclesToRespectTiming(
+    const BlockState &EpilogueBS, const BlockState &LoopBS) const {
+
+  const MachineBasicBlock &EpilogueMBB = *EpilogueBS.TheBlock;
+  const MachineBasicBlock *LoopMBB = getSinglePredecessor(EpilogueMBB);
+
+  DEBUG_LOOPAWARE(dbgs() << "** Loop/Epilogue-carried latency dependencies:"
+                         << " Original Loop " << *LoopMBB
+                         << " Original Epilogue " << EpilogueMBB << "\n");
+
+  InterBlockEdges Edges(*Context);
+  std::map<const MachineInstr *, int> DistancesFromLoopEntry;
+  int DistFromLoopEntry = 0;
+  int EntryNops = 0;
+
+  // Add all instructions of a region to the graph and record, for each of
+  // them, the index of its bundle counted from the first bundle added.
+  auto AddRegionToEdges = [&](const Region &R) {
+    for (auto &Bundle : R.Bundles) {
+      for (MachineInstr *MI : Bundle.getInstrs()) {
+        DistancesFromLoopEntry[MI] = DistFromLoopEntry;
+        Edges.addNode(MI);
+      }
+      ++DistFromLoopEntry;
+    }
+  };
+
+  // Construction of the superblock containing Loop+Epilogue
+  // First part is the loop
+  AddRegionToEdges(LoopBS.getBottom());
+  Edges.markBoundary();
+  // Second part is the epilogue itself
+  AddRegionToEdges(EpilogueBS.getTop());
+  Edges.buildEdges();
+
+  DEBUG_LOOPAWARE(dumpInterBlock(Edges));
+  // Check cross-boundary latencies, walking the loop bundles from the
+  // boundary backwards and stopping after getConflictHorizon() bundles.
+  int Height = 1;
+  for (auto &Bundle : reverse(LoopBS.getBottom().Bundles)) {
+    for (auto *PreBoundaryMI : Bundle.getInstrs()) {
+      const SUnit *Pred = Edges.getPreBoundaryNode(PreBoundaryMI);
+      // addNode() only registers an instruction when an SUnit could be
+      // created for it, so there may be no node to inspect here.
+      // NOTE(review): presumably getPreBoundaryNode() returns nullptr in
+      // that case -- confirm against its implementation.
+      if (!Pred)
+        continue;
+
+      // The position of the loop instruction does not depend on the
+      // successor, so look it up once per instruction.
+      const int PreBoundDist = DistancesFromLoopEntry[PreBoundaryMI];
+
+      for (auto &SDep : Pred->Succs) {
+        auto *Succ = SDep.getSUnit();
+
+        if (!Edges.isPostBoundaryNode(Succ))
+          continue;
+
+        const MachineInstr *PostBoundaryMI = Succ->getInstr();
+
+        const int PostBoundOrExitDist =
+            (PostBoundaryMI != nullptr)
+                ? DistancesFromLoopEntry[PostBoundaryMI]
+                // When getInstr returns nullptr, we reached
+                // ExitSU. We can consider the DistFromLoopEntry as
+                // depth of the ExitSU.
+                : DistFromLoopEntry;
+
+        const int Latency = SDep.getSignedLatency();
+        const int Distance = PostBoundOrExitDist - PreBoundDist;
+
+        DEBUG_LOOPAWARE(dbgs() << "Data dependency found:\n"
+                               << " Loop instruction SU: " << *PreBoundaryMI);
+        DEBUG_LOOPAWARE(dbgs() << " Epilogue instruction: ";
+                        if (PostBoundaryMI) PostBoundaryMI->dump();
+                        else dbgs() << "nullptr (ExitSU)";);
+        DEBUG_LOOPAWARE(dbgs() << "\n Latency: " << Latency
+                               << "\n Distance: " << Distance << "\n");
+
+        // Only the part of the latency not yet covered by the distance
+        // needs to be compensated with entry nops.
+        EntryNops = std::max(EntryNops, Latency - Distance);
+      }
+    }
+    if (++Height > HR->getConflictHorizon()) {
+      break;
+    }
+  }
+
+  return EntryNops;
+}
+
+/// Compute how many cycles must be inserted between the loop and its
+/// epilogue so that their resource usages do not conflict.
+///
+/// \param ExistingLatency cycles already required to respect latencies; the
+///        loop scoreboard is advanced by this amount before conflicts are
+///        checked.
+/// \param EpilogueBS state of the epilogue block.
+/// \param LoopBS state of the loop block preceding the epilogue.
+/// \returns the total number of cycles, including \p ExistingLatency.
+int InterBlockScheduling::getCyclesToAvoidResourceConflicts(
+    int ExistingLatency, const BlockState &EpilogueBS,
+    const BlockState &LoopBS) const {
+
+  const MachineBasicBlock &EpilogueMBB = *EpilogueBS.TheBlock;
+  MachineBasicBlock *LoopMBB = LoopBS.TheBlock;
+  int Depth = HR->getMaxLookAhead();
+  ResourceScoreboard<FuncUnitWrapper> Bottom;
+  Bottom.reset(Depth);
+
+  DEBUG_LOOPAWARE(dbgs() << "* Loop/Epilogue-carried resource conflicts:"
+                         << " Original Loop " << *LoopMBB << " Original Epilog "
+                         << EpilogueMBB << "\n");
+
+  emitBundlesInScoreboard(LoopBS.getBottom().Bundles, Bottom, HR.get());
+
+  // We know how many latency cycles we need to respect, and we can advance
+  // the scoreboard to the first possible cycle that can accommodate another
+  // instruction and start the resource verification from this point, tracking
+  // the number of NOPS.
+  int NopCounter = 0;
+  while (NopCounter < ExistingLatency) {
+    Bottom.advance();
+    ++NopCounter;
+  }
+
+  DEBUG_LOOPAWARE(dbgs() << "Loop scoreboard\n"; Bottom.dump());
+
+  ResourceScoreboard<FuncUnitWrapper> Top;
+  Top.reset(Depth);
+  int Cycle = -Depth;
+
+  // NOTE(review): the latency analysis in getCyclesToRespectTiming() reads
+  // EpilogueBS.getTop(), while the bundles here come from getBottom().
+  // Confirm both name the same region for an epilogue, or whether getTop()
+  // is intended here as well.
+  emitBundlesInScoreboardDelta(EpilogueBS.getBottom().Bundles, Top, Cycle,
+                               HR.get());
+
+  DEBUG_LOOPAWARE(dbgs() << "Epilogue scoreboard\n"; Top.dump());
+
+  // Use scoreboard comparison to calculate the number of nops: every
+  // additional advance of the loop scoreboard costs one more nop.
+  while (Bottom.conflict(Top, Depth)) {
+    Bottom.advance();
+    NopCounter++;
+  }
+
+  DEBUG_LOOPAWARE(dbgs() << "Resource conflict avoidance between"
+                         << " loop: " << *LoopMBB
+                         << " And epilogue: " << EpilogueMBB << " Requires "
+                         << NopCounter << " Nops\n");
+
+  return NopCounter;
+}
+
 void InterBlockEdges::addNode(MachineInstr *MI) {
   if (auto Index = DDG.initSUnit(*MI)) {
     IndexMap &TheMap = Boundary ? SuccMap : PredMap;
@@ -461,12 +639,6 @@ void BlockState::classify() {
   // construction.
 }
 
-void dumpInterBlock(const InterBlockEdges &Edges) {
-  for (const SUnit &SU : Edges) {
-    dbgs() << "SU" << SU.NodeNum << ": " << *SU.getInstr();
-  }
-}
-
 void BlockState::initInterBlock(const MachineSchedContext &Context) {
   BoundaryEdges = std::make_unique<InterBlockEdges>(Context);
 

@@ -257,6 +257,17 @@ class InterBlockScheduling {
   /// returns true if converged
   bool updateFixPoint(BlockState &BS);
 
+  /// Calculate the number of cycles that are needed at the entry of the
+  /// epilogue to respect the latencies carried from its associated loop.
+  int getCyclesToRespectTiming(const BlockState &EpilogueBS,
+                               const BlockState &LoopBS) const;
+
+  /// Calculate the number of cycles that are needed to avoid resource
+  /// conflicts between loop and epilogue. The check starts after
+  /// \p ExistingLatency cycles, and the returned count includes them.
+  int getCyclesToAvoidResourceConflicts(int ExistingLatency,
+                                        const BlockState &EpilogueBS,
+                                        const BlockState &LoopBS) const;
+
   BlockState *CurrentBlock = nullptr;
 
 public:

@@ -153,10 +153,6 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
 ; ASM-NEXT:    add r1, r1, #33; vmac cm7, cm7, x5, x6, r4 // Delay Slot 1
 ; ASM-NEXT:  // %bb.3: // %outer.loop.latch
 ; ASM-NEXT:    // in Loop: Header=BB0_1 Depth=1
-; ASM-NEXT:    nopa ; nopb ; nopx
-; ASM-NEXT:    nop
-; ASM-NEXT:    nop
-; ASM-NEXT:    nop
 ; ASM-NEXT:    vst.srs.s16.s32 bmh0, s2, [p3, #32]
 ; ASM-NEXT:    vst.srs.s16.s32 bml0, s2, [p3], #64
 ; ASM-NEXT:    vst.srs.s16.s32 bmh1, s2, [p3, #32]; mov m2, r31