[AIEX] Create SUs for top-fixed instructions
Also add all related dependencies to have safety margins.
andcarminati committed Dec 10, 2024
1 parent c1dc7df commit 04a2805
Showing 18 changed files with 417 additions and 54 deletions.
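At its core, the patch turns the region's top-fixed instructions into scheduling units chained from EntrySU through artificial, latency-carrying edges, so that each one's depth, and hence its emission cycle, is fully determined. The standalone sketch below is a hypothetical illustration of that idea and is not part of the commit: depth is the maximum over predecessors of predecessor depth plus edge latency, which is exactly what pins the fixed units in place.

// Hypothetical sketch (not part of the commit): chaining nodes with
// artificial latency edges pins each node's depth, i.e. the earliest cycle
// the scheduler can place it in.
#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

struct Node {
  const char *Name;
  std::vector<std::pair<Node *, int>> Preds; // (predecessor, edge latency)
  int depth() const {
    int D = 0;
    for (const auto &[Pred, Lat] : Preds)
      D = std::max(D, Pred->depth() + Lat);
    return D;
  }
};

int main() {
  Node Entry{"EntrySU", {}};
  // The first fixed instruction hangs off Entry with latency 0, every
  // following one off its predecessor with latency 1, mirroring the
  // EmitFixedSUnits mutation in the diff below.
  Node F0{"Fixed0", {{&Entry, 0}}};
  Node F1{"Fixed1", {{&F0, 1}}};
  Node F2{"Fixed2", {{&F1, 1}}};
  for (const Node *N : {&F0, &F1, &F2})
    std::printf("%s is pinned to cycle %d\n", N->Name, N->depth());
  return 0;
}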
142 changes: 142 additions & 0 deletions llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
@@ -20,6 +20,7 @@
#include "AIEMachineScheduler.h"
#include "AIEMaxLatencyFinder.h"
#include "AIESubtarget.h"
#include "Utils/AIELoopUtils.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ScheduleDAG.h"
@@ -314,20 +315,102 @@ class RegionEndEdges : public ScheduleDAGMutation {
/// "fixed" SUnits.
class EmitFixedSUnits : public ScheduleDAGMutation {
public:
  struct RegAvailabilityTracker {
    using fixed_iterator = MachineBasicBlock::iterator;
    const InstrItineraryData *InstrItins;
    const TargetRegisterInfo *TRI;
    std::map<MCRegister, unsigned> RegisterToCycle;

    RegAvailabilityTracker(const InstrItineraryData *Itins,
                           const TargetRegisterInfo *TRI)
        : InstrItins(Itins), TRI(TRI) {}

    void updateRegisterAvailability(Register Reg, unsigned Latency) {
      for (MCRegAliasIterator Ali(Reg, TRI, true); Ali.isValid(); ++Ali) {
        auto RegCycle = RegisterToCycle.find(*Ali);
        if (RegCycle == RegisterToCycle.end()) {
          RegisterToCycle[*Ali] = Latency;
        } else {
          RegisterToCycle[*Ali] = std::max(RegCycle->second, Latency);
        }
      }
    }

    // Calculate the cycle in which each register is last materialized, given
    // a sequence of timed bundles. PastTheEndCycles projects the availability
    // cycle into the subsequent region.
    void computeAvailabilityCycles(ArrayRef<AIE::MachineBundle> Bundles,
                                   bool PastTheEndCycles) {
      int Cycle = 0;
      const int TotalCycles = Bundles.size();
      for (const auto &Bundle : Bundles) {
        for (MachineInstr *SrcMI : Bundle.getInstrs()) {
          for (unsigned OpNum = 0; OpNum < SrcMI->getNumOperands(); OpNum++) {
            unsigned SrcClass = SrcMI->getDesc().getSchedClass();
            const MachineOperand &MO = SrcMI->getOperand(OpNum);
            if (!MO.isReg() || !MO.isDef())
              continue;
            std::optional<unsigned> OptSrcCycle =
                InstrItins->getOperandCycle(SrcClass, OpNum);
            assert(OptSrcCycle);
            int Latency = *OptSrcCycle;
            int MaterializationCycle = 0;
            if (PastTheEndCycles) {
              int RemainingCycles = TotalCycles - Cycle - 1;
              MaterializationCycle = std::max(Latency - RemainingCycles - 1, 0);
            } else {
              MaterializationCycle = Cycle + Latency;
            }
            updateRegisterAvailability(MO.getReg(), MaterializationCycle);
          }
        }
        Cycle++;
      }
    }

    unsigned getMaxSrcOperandLatency(const MachineInstr &MI) const {
      unsigned MaxLatency = 0;
      for (const MachineOperand &MO : MI.all_uses()) {
        if (!MO.isReg() || !MO.isUse())
          continue;
        for (MCRegAliasIterator Ali(MO.getReg(), TRI, true); Ali.isValid();
             ++Ali) {
          auto RegCycle = RegisterToCycle.find(*Ali);
          if (RegCycle != RegisterToCycle.end()) {
            MaxLatency = std::max(RegCycle->second, MaxLatency);
          }
        }
      }
      return MaxLatency;
    }
  };

  void apply(ScheduleDAGInstrs *DAG) override {
    AIEPostRASchedStrategy *Scheduler =
        static_cast<AIEScheduleDAGMI *>(DAG)->getSchedImpl();
    auto *TII = static_cast<const AIEBaseInstrInfo *>(DAG->TII);
    auto *ItinData = DAG->MF.getSubtarget().getInstrItineraryData();
    const TargetRegisterInfo *TRI = DAG->MF.getSubtarget().getRegisterInfo();
    const BlockState &BS =
        Scheduler->getInterBlock().getBlockState(DAG->getBB());
    const Region &CurRegion = BS.getCurrentRegion();
    RegAvailabilityTracker RAT{ItinData, TRI};

    // First, create SUnits for all "fixed" instructions.
    // Those will be chained from/to the EntrySU/ExitSU to ensure they are
    // placed in the correct cycle. The scheduler will enforce that these fixed
    // SUnits get placed exactly at their depth (for the Top zone) or height
    // (for the Bot zone).
    SUnit *Pred = &DAG->EntrySU;
    for (MachineInstr &MI : CurRegion.top_fixed_instrs()) {
      SUnit &FixedSU = Scheduler->addFixedSUnit(MI, /*IsTop=*/true);
      SDep Dep(Pred, SDep::Artificial);
      Dep.setLatency(Pred == &DAG->EntrySU ? 0 : 1);
      FixedSU.addPred(Dep);
      Pred = &FixedSU;
    }

    SUnit *Succ = &DAG->ExitSU;
    for (MachineInstr &MI : reverse(CurRegion.bot_fixed_instrs())) {
      SUnit &FixedSU = Scheduler->addFixedSUnit(MI, /*IsTop=*/false);
@@ -359,6 +442,65 @@ class EmitFixedSUnits : public ScheduleDAGMutation {
          AIE::maxLatency(&MI, *TII, *ItinData, /*IncludeStages=*/true));
      FixedDepSU->addPred(Dep, /*Required=*/true);
    }

    // We only need to focus on top-fixed instructions when there is an
    // Epilogue block.
    if (BS.Kind != BlockType::Epilogue)
      return;

    MachineBasicBlock *Loop = AIELoopUtils::getLoopPredecessor(*DAG->getBB());
    assert(Loop);
    const BlockState &LBS = Scheduler->getInterBlock().getBlockState(Loop);
    assert(LBS.Kind == BlockType::Loop);

    if (!LBS.isPipelined())
      return;

    ArrayRef<AIE::MachineBundle> LoopTimedBundles = LBS.getTop().Bundles;
    ArrayRef<AIE::MachineBundle> TopFixedBundles =
        CurRegion.getTopFixedBundles();

    RAT.computeAvailabilityCycles(TopFixedBundles, /*PastTheEndCycles=*/false);
    // Reusing the RAT to establish the individual safety margins between the
    // pipelined loop and the free instructions is more cost-effective: all
    // dependencies related to EntrySU are managed in one centralized location.
    // While it is possible to implement this as a separate mutator, doing so
    // would be costly, because it would prevent the creation of multiple edges
    // from EntrySU to a free instruction that depends on both timed regions
    // (TopFixed and LoopTimed).
    RAT.computeAvailabilityCycles(LoopTimedBundles, /*PastTheEndCycles=*/true);

    auto IsNotTopFixedSU = [Scheduler](const SUnit &SU) {
      return !Scheduler->isFixedSU(SU, true);
    };

    // Establish dependencies for each non-top-fixed scheduling unit, taking
    // into account the availability cycle of each of its operands.
    for (SUnit &SU : make_filter_range(DAG->SUnits, IsNotTopFixedSU)) {
      const MachineInstr &MI = *SU.getInstr();
      if (const unsigned Latency = RAT.getMaxSrcOperandLatency(MI)) {
        SDep Dep(&DAG->EntrySU, SDep::Artificial);
        Dep.setLatency(Latency);
        SU.addPred(Dep, /*Required=*/true);
      }
    }

    auto IsTopFixedSU = [Scheduler](const SUnit &SU) {
      return Scheduler->isFixedSU(SU, true);
    };

    // Establish dependencies to ExitSU for each top-fixed scheduling unit,
    // taking into account its maximum latency.
    for (SUnit &FixedSU : make_filter_range(DAG->SUnits, IsTopFixedSU)) {
      const MachineInstr &MI = *FixedSU.getInstr();
      if (const unsigned Latency = RAT.getMaxSrcOperandLatency(MI)) {
        SDep Dep(&FixedSU, SDep::Artificial);
        int EdgeLatency =
            AIE::maxLatency(&MI, *TII, *ItinData, /*IncludeStages=*/true);
        Dep.setLatency(EdgeLatency);
        DAG->ExitSU.addPred(Dep, /*Required=*/true);
      }
    }
  }
};

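For intuition on the RegAvailabilityTracker added above, here is a hypothetical, self-contained sketch (not part of the commit; registers are plain integers and latencies are made up). It maps every register defined in a timed bundle to the last cycle in which it materializes; with PastTheEndCycles the cycle is projected past the end of the region, which is the value used as a safety margin at the start of the next region.

// Hypothetical sketch (not part of the commit): toy availability tracking
// over a sequence of timed bundles, one bundle per cycle.
#include <algorithm>
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

using Def = std::pair<unsigned, int>; // (register, operand latency)
using Bundle = std::vector<Def>;      // defs issued in one cycle

// Record, for every register, the last cycle in which it becomes available.
static void computeAvailability(const std::vector<Bundle> &Bundles,
                                bool PastTheEndCycles,
                                std::map<unsigned, int> &RegToCycle) {
  const int TotalCycles = static_cast<int>(Bundles.size());
  for (int Cycle = 0; Cycle < TotalCycles; ++Cycle) {
    for (const Def &D : Bundles[Cycle]) {
      int Materialization;
      if (PastTheEndCycles) {
        // Project into the next region: only the part of the latency that is
        // not already covered by the remaining cycles of this region.
        int Remaining = TotalCycles - Cycle - 1;
        Materialization = std::max(D.second - Remaining - 1, 0);
      } else {
        Materialization = Cycle + D.second;
      }
      auto It = RegToCycle.find(D.first);
      if (It == RegToCycle.end())
        RegToCycle.emplace(D.first, Materialization);
      else
        It->second = std::max(It->second, Materialization);
    }
  }
}

int main() {
  // Two-cycle region: register 1 (latency 1) is defined in cycle 0, register
  // 5 (latency 3) in the last cycle, so two cycles of margin remain for the
  // next region before register 5 can be read.
  std::vector<Bundle> Region = {{{1, 1}}, {{5, 3}}};
  std::map<unsigned, int> RegToCycle;
  computeAvailability(Region, /*PastTheEndCycles=*/true, RegToCycle);
  for (const auto &[Reg, Cycle] : RegToCycle)
    std::printf("r%u: %d cycle(s) of margin in the next region\n", Reg, Cycle);
  return 0;
}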
18 changes: 2 additions & 16 deletions llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
@@ -204,20 +204,6 @@ MachineInstr *checkResourceConflictsTopDown(
  return ConflictMI;
}

MachineBasicBlock *getLoopPredecessor(const MachineBasicBlock &MBB) {
  if (MBB.pred_size() == 1) {
    // if we have only one, it must be the loop
    return *MBB.predecessors().begin();
  }
  // Otherwise, the loop is the fallthrough predecessor by construction
  for (auto *Pred : MBB.predecessors()) {
    if (Pred->isLayoutSuccessor(&MBB)) {
      return Pred;
    }
  }
  return nullptr;
}

InterBlockScheduling::InterBlockScheduling(const MachineSchedContext *C,
                                           bool InterBlock)
    : Context(C), InterBlockScoreboard(InterBlock) {}
@@ -809,7 +795,7 @@ void InterBlockScheduling::emitInterBlockSafetyMargin(
  }

  MachineBasicBlock *BB = BS.TheBlock;
  MachineBasicBlock *Loop = getLoopPredecessor(*BB);
  MachineBasicBlock *Loop = AIELoopUtils::getLoopPredecessor(*BB);
  assert(Loop);
  const BlockState &LBS = getBlockState(Loop);

@@ -833,7 +819,7 @@ void InterBlockScheduling::emitInterBlockTop(BlockState &BS) {
  }

  MachineBasicBlock *BB = BS.TheBlock;
  MachineBasicBlock *Loop = getLoopPredecessor(*BB);
  MachineBasicBlock *Loop = AIELoopUtils::getLoopPredecessor(*BB);
  assert(Loop);
  const BlockState &LBS = getBlockState(Loop);

15 changes: 15 additions & 0 deletions llvm/lib/Target/AIE/AIEMaxLatencyFinder.cpp
@@ -30,6 +30,21 @@
// operand and the memory latency. Include the stage latency if requested.
int maxLatency(const MachineInstr *MI, const AIEBaseInstrInfo &InstrInfo,
               const InstrItineraryData &Itineraries, bool IncludeStages) {

  // If we have a Bundle, query maxLatency for each bundled instruction.
  if (MI->getOpcode() == TargetOpcode::BUNDLE) {
    int BundleLatency = 0;
    MachineBasicBlock::const_instr_iterator I = MI->getIterator();
    MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
    while (++I != E && I->isBundledWithPred()) {
      assert(!I->isBundle() && "No nested bundle!");
      BundleLatency =
          std::max(maxLatency(&(*I), InstrInfo, Itineraries, IncludeStages),
                   BundleLatency);
    }
    return BundleLatency;
  }

  int Latency = 0;
  unsigned SrcClass = MI->getDesc().getSchedClass();
  for (unsigned I = 0;; I++) {
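The new bundle case in maxLatency above is a plain max-reduction over the bundled instructions. A hypothetical sketch with made-up per-instruction latencies (not the real AIE itineraries or LLVM API):

// Hypothetical sketch (not the actual LLVM API): the max latency of a bundle
// is the maximum of the latencies of its bundled instructions.
#include <algorithm>
#include <cstdio>
#include <vector>

struct FakeInstr {
  const char *Name;
  int MaxLatency; // stand-in for AIE::maxLatency() of a single instruction
};

static int bundleMaxLatency(const std::vector<FakeInstr> &Bundle) {
  int Latency = 0;
  for (const FakeInstr &I : Bundle)
    Latency = std::max(Latency, I.MaxLatency);
  return Latency;
}

int main() {
  // Made-up latencies: a load-like instruction dominates the bundle.
  std::vector<FakeInstr> Bundle = {{"VLDB", 7}, {"ADD", 1}, {"NOP", 1}};
  std::printf("bundle max latency = %d\n", bundleMaxLatency(Bundle));
  return 0;
}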
14 changes: 14 additions & 0 deletions llvm/lib/Target/AIE/Utils/AIELoopUtils.cpp
@@ -116,4 +116,18 @@ bool isSingleMBBLoop(const MachineBasicBlock *MBB) {
  return NumLoopEdges == 1 && NumExitEdges == 1;
}

MachineBasicBlock *getLoopPredecessor(const MachineBasicBlock &MBB) {
  if (MBB.pred_size() == 1) {
    // if we have only one, it must be the loop
    return *MBB.predecessors().begin();
  }
  // Otherwise, the loop is the fallthrough predecessor by construction
  for (auto *Pred : MBB.predecessors()) {
    if (Pred->isLayoutSuccessor(&MBB)) {
      return Pred;
    }
  }
  return nullptr;
}

} // namespace llvm::AIELoopUtils
4 changes: 4 additions & 0 deletions llvm/lib/Target/AIE/Utils/AIELoopUtils.h
@@ -48,6 +48,10 @@
/// Check if this block is a single block loop.
bool isSingleMBBLoop(const MachineBasicBlock *MBB);

/// Return the loop predecessor of MBB, i.e. its single predecessor or, if
/// there are several, the fall-through (layout) predecessor; nullptr if none.
MachineBasicBlock *getLoopPredecessor(const MachineBasicBlock &MBB);

} // namespace llvm::AIELoopUtils

#endif
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll
@@ -8,8 +8,8 @@
; RUN: llc -O2 -mtriple=aie2 --enable-pipeliner=1 --enable-aie-hardware-loops=false \
; RUN: --enable-aie-zero-overhead-loops=false %s -o - | FileCheck %s --check-prefix=DCL
; RUN: llc -O2 -mtriple=aie2 --enable-pipeliner=1 %s -o - | FileCheck %s --check-prefix=ZOL
; RUN: llc -O2 -mtriple=aie2 --enable-pipeliner=0 %s -o - --debug-only=machine-scheduler \
; RUN: 2>&1 | %imisched -d - \
; RUN: llc -O2 -mtriple=aie2 --enable-pipeliner=0 --aie-postpipeliner-maxii=0 %s -o \
; RUN: - --debug-only=machine-scheduler 2>&1 | %imisched -d - \
; RUN: | FileCheck %s --check-prefix=SCHED-DUMP

; Variation of the already existing test, but enabling SWP under different flavors:
@@ -39,7 +39,8 @@
; CHECK-NEXT: .L_LEnd0:
; CHECK-NEXT: nopb ; nopa ; st r1, [p0], #4; add r1, r1, #1; nopm ; nopv
; CHECK-NEXT: // %bb.3: // %for.cond.cleanup
; CHECK-NEXT: nopb ; nopa ; st r1, [p0], #4; nopxm ; nopv
; CHECK-NEXT: nopa ; nopb ; nopxm ; st r1, [p0], #4
; CHECK-NEXT: nop
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup
; CHECK-NEXT: nopa ; ret lr
38 changes: 16 additions & 22 deletions llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir
@@ -118,44 +118,38 @@
; CHECK-NEXT: vldb wh7, [p1], #32; nopa ; nops ; nopx ; mov r6, p0; vmac cm3, cm3, x3, x10, r4
; CHECK-NEXT: // %bb.3: // %outer.loop.latch
; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: nopa ; and r5, r6, r9; vshift.align x4, x4, s1, x6, r0; vmac cm4, cm4, x5, x10, r4
; CHECK-NEXT: vmac cm5, cm5, x9, x7, r4
; CHECK-NEXT: add r0, r5, #33; vshift.align x2, x2, s1, x8, r0; vmac cm6, cm6, x1, x7, r4
; CHECK-NEXT: nopb ; nopa ; nops ; and r5, r6, r9; vshift.align x4, x4, s1, x6, r0; vmac cm4, cm4, x5, x10, r4
; CHECK-NEXT: nopb ; nopa ; nops ; add r7, r7, #-1; mov s3, r6; vmac cm5, cm5, x9, x7, r4
; CHECK-NEXT: nopa ; add r0, r5, #33; vshift.align x2, x2, s1, x8, r0; vmac cm6, cm6, x1, x7, r4
; CHECK-NEXT: vshuffle x9, x4, x2, r2; vmac cm7, cm7, x3, x7, r4
; CHECK-NEXT: vshuffle x1, x9, x0, r8; vmac cm0, cm0, x5, x7, r4
; CHECK-NEXT: vshuffle x3, x4, x2, r3; vmac cm1, cm1, x9, x10, r4
; CHECK-NEXT: vshuffle x5, x3, x0, r8; vmac cm2, cm2, x1, x10, r4
; CHECK-NEXT: vmac cm3, cm3, x3, x10, r4
; CHECK-NEXT: vmac cm4, cm4, x5, x10, r4
; CHECK-NEXT: vmac cm5, cm5, x9, x7, r4
; CHECK-NEXT: vmac cm6, cm6, x1, x7, r4
; CHECK-NEXT: vmac cm7, cm7, x3, x7, r4
; CHECK-NEXT: vmac cm0, cm0, x5, x7, r4
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: vst.srs.s16.s32 bmh1, s2, [p3, #32]; mov s3, r6
; CHECK-NEXT: vst.srs.s16.s32 bml1, s3, [p3], #64
; CHECK-NEXT: vst.srs.s16.s32 bmh2, s3, [p3, #32]
; CHECK-NEXT: mov dc5, r26; vmac cm3, cm3, x3, x10, r4
; CHECK-NEXT: mov dn5, r27; vmac cm4, cm4, x5, x10, r4
; CHECK-NEXT: mov dj5, r28; vmac cm5, cm5, x9, x7, r4
; CHECK-NEXT: vst.srs.s16.s32 bmh1, s2, [p3, #32]; mov m1, r10; vmac cm6, cm6, x1, x7, r4
; CHECK-NEXT: vst.srs.s16.s32 bml1, s3, [p3], #64; mov m2, r13; vmac cm7, cm7, x3, x7, r4
; CHECK-NEXT: padda.3d [p1], d2; vst.srs.s16.s32 bmh2, s3, [p3, #32]; mov m3, r14; vmac cm0, cm0, x5, x7, r4
; CHECK-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m4
; CHECK-NEXT: vst.srs.s16.s32 bmh3, s3, [p3, #32]
; CHECK-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64
; CHECK-NEXT: vst.srs.s16.s32 bmh4, s3, [p3, #32]
; CHECK-NEXT: vst.srs.s16.s32 bml4, s3, [p3], m7
; CHECK-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32]
; CHECK-NEXT: vst.srs.s16.s32 bml5, s3, [p3], #64
; CHECK-NEXT: vst.srs.s16.s32 bmh6, s3, [p3, #32]; mov dc5, r26
; CHECK-NEXT: vst.srs.s16.s32 bml6, s3, [p3], m4; mov dn5, r27
; CHECK-NEXT: vst.srs.s16.s32 bmh7, s3, [p3, #32]; mov dj5, r28
; CHECK-NEXT: vst.srs.s16.s32 bml7, s3, [p3], #64; mov m1, r10
; CHECK-NEXT: vst.srs.s16.s32 bmh0, s3, [p3, #32]; mov m2, r13
; CHECK-NEXT: vst.srs.s16.s32 bmh6, s3, [p3, #32]
; CHECK-NEXT: vst.srs.s16.s32 bml6, s3, [p3], m4
; CHECK-NEXT: vst.srs.s16.s32 bmh7, s3, [p3, #32]
; CHECK-NEXT: vst.srs.s16.s32 bml7, s3, [p3], #64
; CHECK-NEXT: vst.srs.s16.s32 bmh0, s3, [p3, #32]
; CHECK-NEXT: vst.2d.srs.s16.s32 bml0, s3, [p3], d5; mov dj5, r11
; CHECK-NEXT: add r7, r7, #-1; mov dn5, r12
; CHECK-NEXT: mov dn5, r12
; CHECK-NEXT: jnz r7, #.LBB0_1
; CHECK-NEXT: mov r26, dc5 // Delay Slot 5
; CHECK-NEXT: mov dc5, r25 // Delay Slot 4
; CHECK-NEXT: padda.3d [p0], d1; mov m1, r24 // Delay Slot 3
; CHECK-NEXT: padda.3d [p1], d2; paddb [p2], m1; mov m3, r14 // Delay Slot 2
; CHECK-NEXT: paddb [p2], m1 // Delay Slot 2
; CHECK-NEXT: padda.3d [p2], d3; mov r25, dc5 // Delay Slot 1
; CHECK-NEXT: // %bb.4: // %exitStub
; CHECK-NEXT: lda p7, [sp, #-32]; nopb ; nopxm // 4-byte Folded Reload