Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add InterBlock loop/epilogue analysis #51

Merged
merged 4 commits into from
Jun 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,10 @@ namespace {
const constexpr unsigned NumDelaySlots = 5;
} // namespace

unsigned AIEBaseInstrInfo::getNumDelaySlots(const MachineInstr &MI) const {
return MI.hasDelaySlot() ? NumDelaySlots : 0;
unsigned
AIEBaseInstrInfo::getNumDelaySlots(const MachineInstr &MI,
MachineInstr::QueryType Query) const {
return MI.hasDelaySlot(Query) ? NumDelaySlots : 0;
}

unsigned
Expand Down
5 changes: 4 additions & 1 deletion llvm/lib/Target/AIE/AIEBaseInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,10 @@ struct AIEBaseInstrInfo : public TargetInstrInfo {

/// Returns the number of delay slots that this instruction requires.
/// This might be 0
virtual unsigned getNumDelaySlots(const MachineInstr &MI) const;
virtual unsigned
getNumDelaySlots(const MachineInstr &MI,
MachineInstr::QueryType Query =
MachineInstr::QueryType::AnyInBundle) const;

/// Returns the number of delay slots that should be reserved, i.e.
/// not filled in by the scheduler.
Expand Down
21 changes: 15 additions & 6 deletions llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ class RegionEndEdges : public ScheduleDAGMutation {
}
}
void apply(ScheduleDAGInstrs *DAG) override {
AIE::MaxLatencyFinder MaxLatency(static_cast<AIEScheduleDAGMI *>(DAG));
AIE::MaxLatencyFinder MaxLatency(DAG);

// Default edges to ExitSU are conservative, and can't be shrunk.
// We really should know what we're doing here, so just remove and
Expand All @@ -224,7 +224,12 @@ class RegionEndEdges : public ScheduleDAGMutation {

SDep ExitDep(&SU, SDep::Artificial);

unsigned DelaySlots = TII->getNumDelaySlots(MI);
// By using IgnoreBundle, we can safely apply this mutation to already
// bundled instructions without causing misclassification of instructions
// that are bundled with control flow ones. Otherwise, the assertion
// below can be triggered for correct cases.
unsigned DelaySlots =
TII->getNumDelaySlots(MI, MachineInstr::QueryType::IgnoreBundle);
unsigned EdgeLatency = !DelaySlots && UserSetLatencyMargin
? UserLatencyMargin
: MaxLatency(MI);
Expand Down Expand Up @@ -355,12 +360,15 @@ class PropagateIncomingLatencies : public ScheduleDAGMutation {
class MemoryEdges : public ScheduleDAGMutation {
void apply(ScheduleDAGInstrs *DAG) override {
const auto *TII = static_cast<const AIEBaseInstrInfo *>(DAG->TII);

// Query individual instruction behavior. This is because we might create
// dependencies with already-scheduled blocks where Bundles have been
// created.
const auto QueryType = MachineInstr::QueryType::IgnoreBundle;
// Run over all instructions that may load or store, and correct the
// latencies for all their memory dependencies.
for (SUnit &SU : DAG->SUnits) {
MachineInstr &MI = *SU.getInstr();
if (!MI.mayLoadOrStore()) {
if (!MI.mayLoadOrStore(QueryType)) {
continue;
}

Expand All @@ -369,13 +377,14 @@ class MemoryEdges : public ScheduleDAGMutation {

// Ignore non-memory dependencies. Locks or other instructions with side
// effects aren't handled with MemInstrItinData itineraries.
if (!PredEdge.isNormalMemoryOrBarrier() || !SrcMI.mayLoadOrStore()) {
if (!PredEdge.isNormalMemoryOrBarrier() ||
!SrcMI.mayLoadOrStore(QueryType)) {
continue;
}

// Ignore Load-Load (RAR) dependencies.
// TODO: Those should probably be removed altogether.
if (!SrcMI.mayStore() && !MI.mayStore()) {
if (!SrcMI.mayStore(QueryType) && !MI.mayStore(QueryType)) {
continue;
}

Expand Down
220 changes: 196 additions & 24 deletions llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,58 @@ static cl::opt<bool>
LoopAware("aie-loop-aware", cl::init(true),
cl::desc("[AIE] Schedule single block loops iteratively"));

static cl::opt<bool> LoopEpilogueAnalysis(
"aie-loop-epilogue-analysis", cl::init(true),
cl::desc("[AIE] Perform Loop/Epilogue analysis with loop scheduling"));

namespace llvm::AIE {

/// Debug helper: walk every SUnit exposed by \p Edges and print
/// "SU<NodeNum>: " followed by the unit's MachineInstr to dbgs().
void dumpInterBlock(const InterBlockEdges &Edges) {
  for (const SUnit &Node : Edges) {
    const MachineInstr &Instr = *Node.getInstr();
    dbgs() << "SU" << Node.NodeNum << ": " << Instr;
  }
}

/// Replay the trailing bundles of \p Bundles into \p Scoreboard.
///
/// At most HR->getConflictHorizon() bundles are emitted, taken from the end
/// of \p Bundles; bundles further back than the conflict horizon are skipped
/// because they can no longer cause conflicts. Every instruction of a bundle
/// is emitted with a delta of 0, and the scoreboard is advanced once after
/// each bundle.
void emitBundlesInScoreboard(const std::vector<MachineBundle> &Bundles,
                             ResourceScoreboard<FuncUnitWrapper> &Scoreboard,
                             AIEHazardRecognizer *HR) {
  const int NumBundles = Bundles.size();
  // Index of the first bundle that still lies within the conflict horizon.
  const int FirstRelevant =
      NumBundles - std::min(NumBundles, HR->getConflictHorizon());

  for (int Idx = FirstRelevant; Idx < NumBundles; ++Idx) {
    for (MachineInstr *MI : Bundles[Idx].getInstrs()) {
      HR->emitInScoreboard(Scoreboard, MI->getDesc(), 0);
    }
    Scoreboard.advance();
  }
}

void emitBundlesInScoreboardDelta(
const std::vector<MachineBundle> &Bundles,
ResourceScoreboard<FuncUnitWrapper> &Scoreboard, int &Delta,
AIEHazardRecognizer *HR) {

for (auto &Bundle : Bundles) {
// We don't need to replay more instructions, because we exhausted the
// scoreboard.
if (Delta >= 0)
break;

for (MachineInstr *MI : Bundle.getInstrs())
HR->emitInScoreboard(Scoreboard, MI->getDesc(), Delta);

Delta++;
}
}

/// Return the one and only predecessor of \p MBB.
///
/// The caller must guarantee that \p MBB has exactly one predecessor; this is
/// checked with an assertion. The assertion fires both for blocks with no
/// predecessor and for blocks with several, so the message states the actual
/// requirement rather than only one of the failure modes.
MachineBasicBlock *getSinglePredecessor(const MachineBasicBlock &MBB) {
  assert(MBB.pred_size() == 1 && "MBB must have exactly one predecessor");
  return *MBB.predecessors().begin();
}

InterBlockScheduling::InterBlockScheduling(const MachineSchedContext *C,
bool InterBlock)
: Context(C), InterBlockScoreboard(InterBlock) {}
Expand Down Expand Up @@ -118,12 +168,8 @@ bool InterBlockScheduling::resourcesConverged(BlockState &BS) const {
ResourceScoreboard<FuncUnitWrapper> Bottom;
Bottom.reset(Depth);

for (auto &Bundle : BS.getBottom().Bundles) {
for (MachineInstr *MI : Bundle.getInstrs()) {
HR->emitInScoreboard(Bottom, MI->getDesc(), 0);
}
Bottom.advance();
}
emitBundlesInScoreboard(BS.getBottom().Bundles, Bottom, HR.get());

DEBUG_LOOPAWARE(dbgs() << "Bottom scoreboard\n"; Bottom.dump());
// We have two successors, the loop itself and the epilogue
assert(BS.TheBlock->succ_size() == 2);
Expand All @@ -138,15 +184,9 @@ bool InterBlockScheduling::resourcesConverged(BlockState &BS) const {
ResourceScoreboard<FuncUnitWrapper> Top;
Top.reset(Depth);
int Cycle = -Depth;
for (auto &Bundle : BS.getBottom().Bundles) {
if (Cycle >= 0) {
break;
}
for (MachineInstr *MI : Bundle.getInstrs()) {
HR->emitInScoreboard(Top, MI->getDesc(), Cycle);
}
Cycle++;
}

emitBundlesInScoreboardDelta(BS.getBottom().Bundles, Top, Cycle, HR.get());

DEBUG_LOOPAWARE(dbgs() << "Top scoreboard\n"; Top.dump());
if (Bottom.conflict(Top, Depth)) {
return false;
Expand Down Expand Up @@ -369,14 +409,152 @@ int InterBlockScheduling::getNumEntryNops(const BlockState &BS) const {
}
const MachineBasicBlock &BB = *BS.TheBlock;
assert(BB.pred_size() == 1);
MachineBasicBlock *Loop = *BB.predecessors().begin();
MachineBasicBlock *Loop = getSinglePredecessor(BB);
auto &LBS = getBlockState(Loop);

// TODO: we can do better by doing full interblock analysis
// between BS and LBS
// We can only analyze non-empty epilogue blocks because we need
// to build a DDG, which is not possible for an empty block.
// For empty ones, we need to be conservative because we are not aware of
// the content of the epilogue's successor.
if (LoopEpilogueAnalysis && BB.size() > 0) {
int ExistingLatency = getCyclesToRespectTiming(BS, LBS);
// Start the next step only after clearing latencies.
return getCyclesToAvoidResourceConflicts(ExistingLatency, BS, LBS);
}

return LBS.getSafetyMargin();
}

int InterBlockScheduling::getCyclesToRespectTiming(
const BlockState &EpilogueBS, const BlockState &LoopBS) const {

const MachineBasicBlock &EpilogueMBB = *EpilogueBS.TheBlock;
const MachineBasicBlock *LoopMBB = getSinglePredecessor(EpilogueMBB);

DEBUG_LOOPAWARE(dbgs() << "** Loop/Epilogue-carried latency dependencies:"
<< " Original Loop " << *LoopMBB
<< " Original Epilogue " << EpilogueMBB << "\n");

InterBlockEdges Edges(*Context);
std::map<const MachineInstr *, int> DistancesFromLoopEntry;
int DistFromLoopEntry = 0;
int EntryNops = 0;

auto AddRegionToEdges = [&](const Region &R) {
for (auto &Bundle : R.Bundles) {
for (MachineInstr *MI : Bundle.getInstrs()) {
DistancesFromLoopEntry[MI] = DistFromLoopEntry;
Edges.addNode(MI);
}
++DistFromLoopEntry;
}
};

// Construction of the superblock containing Loop+Epilogue
// First part is the loop
AddRegionToEdges(LoopBS.getBottom());
Edges.markBoundary();
// Second part is the epilogue itself
AddRegionToEdges(EpilogueBS.getTop());
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Maybe also add the ExitSU node of EpilogueBS to the map of distances so we can avoid the if (Succ->isBoundaryNode()) corner case in the loop below.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the current implementation, this map associates instructions with depths, so we would need to change the mapping logic as well.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What I meant is something like: DistancesFromLoopEntry[EpilogueBS.getTop().getExitSU()] = DistFromLoopEntry;, this way there is no if (Succ->isBoundaryNode()) special casing in the loop below.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, I simplified the code a bit more.

Edges.buildEdges();

DEBUG_LOOPAWARE(dumpInterBlock(Edges));
// Check cross-boundary latencies.
int Height = 1;
for (auto &Bundle : reverse(LoopBS.getBottom().Bundles)) {
for (auto *PreBoundaryMI : Bundle.getInstrs()) {
const SUnit *Pred = Edges.getPreBoundaryNode(PreBoundaryMI);

for (auto &SDep : Pred->Succs) {
auto *Succ = SDep.getSUnit();

if (!Edges.isPostBoundaryNode(Succ))
continue;

const MachineInstr *PostBoundaryMI = Succ->getInstr();

const int PostBoundOrExitDist =
(PostBoundaryMI != nullptr)
? DistancesFromLoopEntry[PostBoundaryMI]
// When getInstr returns nullptr, we reached
// ExitSU. We can consider the DistFromLoopEntry as
// depth of the ExitSU.
: DistFromLoopEntry;

const int Latency = SDep.getSignedLatency();
const int Distance =
PostBoundOrExitDist - DistancesFromLoopEntry[PreBoundaryMI];

DEBUG_LOOPAWARE(dbgs() << "Data dependency found:\n"
<< " Loop instruction SU: " << *PreBoundaryMI);
DEBUG_LOOPAWARE(dbgs() << " Epilogue instruction: ";
if (PostBoundaryMI) PostBoundaryMI->dump();
else dbgs() << "nullptr (ExitSU)";);
DEBUG_LOOPAWARE(dbgs() << "\n Latency: " << Latency
<< "\n Distance: " << Distance << "\n");

EntryNops = std::max(EntryNops, Latency - Distance);
}
}
if (++Height > HR->getConflictHorizon()) {
break;
}
}

return EntryNops;
}

int InterBlockScheduling::getCyclesToAvoidResourceConflicts(
int ExistingLatency, const BlockState &EpilogueBS,
const BlockState &LoopBS) const {

const MachineBasicBlock &EpilogueMBB = *EpilogueBS.TheBlock;
MachineBasicBlock *LoopMBB = LoopBS.TheBlock;
int Depth = HR->getMaxLookAhead();
ResourceScoreboard<FuncUnitWrapper> Bottom;
Bottom.reset(Depth);

DEBUG_LOOPAWARE(dbgs() << "* Loop/Epilogue-carried resource conflicts:"
<< " Original Loop " << *LoopMBB << " Original Epilog "
<< EpilogueMBB << "\n");

emitBundlesInScoreboard(LoopBS.getBottom().Bundles, Bottom, HR.get());

// We know how many latency cycles we need to respect, and we can advance
// the scoreboard to the first possible cycle that can accommodate another
// instruction and start the resource verification from this point, tracking
// the number of NOPS.
int NopCounter = 0;
for (NopCounter = 0; NopCounter < ExistingLatency; ++NopCounter)
andcarminati marked this conversation as resolved.
Show resolved Hide resolved
Bottom.advance();

DEBUG_LOOPAWARE(dbgs() << "Loop scoreboard\n"; Bottom.dump());

ResourceScoreboard<FuncUnitWrapper> Top;
Top.reset(Depth);
int Cycle = -Depth;

auto Bundles = EpilogueBS.getBottom().Bundles;

emitBundlesInScoreboardDelta(EpilogueBS.getBottom().Bundles, Top, Cycle,
HR.get());

DEBUG_LOOPAWARE(dbgs() << "Epilogue scoreboard\n"; Top.dump());

// Use scoreboard comparison to calculate the number of nops
while (Bottom.conflict(Top, Depth)) {
Bottom.advance();
NopCounter++;
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: That looks a lot like AIEPostRASchedStrategy::handleRegionConflicts, maybe there's code we can share.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @gbossu, I think it could be a bit complicated, because handleRegionConflicts uses two hazard recognizers, while we use just one, plus scoreboard comparison. I am afraid that a refactor could make the code more confusing.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The implementation is different, but ultimately the goal is the same: we want to insert NOPs until there is no resource hazard. We can do that in a follow-up PR.


DEBUG_LOOPAWARE(dbgs() << "Resource conflict avoidance between"
<< " loop: " << *LoopMBB
<< " And epilogue: " << EpilogueMBB << " Requires "
<< NopCounter << " Nops\n");

return NopCounter;
}

void InterBlockEdges::addNode(MachineInstr *MI) {
if (auto Index = DDG.initSUnit(*MI)) {
IndexMap &TheMap = Boundary ? SuccMap : PredMap;
Expand Down Expand Up @@ -461,12 +639,6 @@ void BlockState::classify() {
// construction.
}

void dumpInterBlock(const InterBlockEdges &Edges) {
for (const SUnit &SU : Edges) {
dbgs() << "SU" << SU.NodeNum << ": " << *SU.getInstr();
}
}

void BlockState::initInterBlock(const MachineSchedContext &Context) {
BoundaryEdges = std::make_unique<InterBlockEdges>(Context);

Expand Down
11 changes: 11 additions & 0 deletions llvm/lib/Target/AIE/AIEInterBlockScheduling.h
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,17 @@ class InterBlockScheduling {
/// returns true if converged
bool updateFixPoint(BlockState &BS);

/// Calculate the number of cycles that are needed to respect
/// latencies related to the loop with which the epilogue is associated
int getCyclesToRespectTiming(const BlockState &EpilogueBS,
const BlockState &LoopBS) const;

/// Calculate the number of cycles that are needed to avoid resource
/// conflicts between loop and epilogue
int getCyclesToAvoidResourceConflicts(int ExistingLatency,
const BlockState &EpilogueBS,
const BlockState &LoopBS) const;

BlockState *CurrentBlock = nullptr;

public:
Expand Down
4 changes: 0 additions & 4 deletions llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll
Original file line number Diff line number Diff line change
Expand Up @@ -153,10 +153,6 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
; ASM-NEXT: add r1, r1, #33; vmac cm7, cm7, x5, x6, r4 // Delay Slot 1
; ASM-NEXT: // %bb.3: // %outer.loop.latch
; ASM-NEXT: // in Loop: Header=BB0_1 Depth=1
; ASM-NEXT: nopa ; nopb ; nopx
; ASM-NEXT: nop
; ASM-NEXT: nop
; ASM-NEXT: nop
; ASM-NEXT: vst.srs.s16.s32 bmh0, s2, [p3, #32]
; ASM-NEXT: vst.srs.s16.s32 bml0, s2, [p3], #64
; ASM-NEXT: vst.srs.s16.s32 bmh1, s2, [p3, #32]; mov m2, r31
Expand Down
Loading
Loading