diff --git a/llvm/lib/Target/AIE/AIE2InstrInfo.cpp b/llvm/lib/Target/AIE/AIE2InstrInfo.cpp index cdd39e6732fa..355ebac441df 100644 --- a/llvm/lib/Target/AIE/AIE2InstrInfo.cpp +++ b/llvm/lib/Target/AIE/AIE2InstrInfo.cpp @@ -983,6 +983,15 @@ bool AIE2InstrInfo::isSchedBarrier(const MachineInstr &MI) const { MI.getOpcode() == AIE2::MOV_CNTR || isDelayedSchedBarrier(MI)); } +bool AIE2InstrInfo::isScalarMove(unsigned OpCode) const { + switch (OpCode) { + case AIE2::MOV_mv_scl: + return true; + default: + return false; + } +} + unsigned AIE2InstrInfo::getNumReservedDelaySlots(const MachineInstr &MI) const { return 0; } @@ -1419,6 +1428,11 @@ AIE2InstrInfo::getVExtractOpInfo(const MachineInstr &MI) const { unsigned AIE2InstrInfo::getMaxLoadStoreSize() const { return 256; } +std::vector AIE2InstrInfo::getDelayedScalarMoveOpcode() const { + return {AIE2::MOV_D1, AIE2::MOV_D2, AIE2::MOV_D3, + AIE2::MOV_D4, AIE2::MOV_D5, AIE2::MOV_D6}; +} + bool AIE2InstrInfo::canCombineWithLoadStore(const MachineInstr &MI) const { if (!isa(MI)) diff --git a/llvm/lib/Target/AIE/AIE2InstrInfo.h b/llvm/lib/Target/AIE/AIE2InstrInfo.h index 5a79a652feb9..3b7576bb0bcf 100644 --- a/llvm/lib/Target/AIE/AIE2InstrInfo.h +++ b/llvm/lib/Target/AIE/AIE2InstrInfo.h @@ -49,9 +49,12 @@ class AIE2InstrInfo : public AIE2GenInstrInfo { unsigned getGenericPadVectorOpcode() const override; unsigned getGenericUnpadVectorOpcode() const override; unsigned getCycleSeparatorOpcode() const override; + std::vector getDelayedScalarMoveOpcode() const override; + bool isLock(unsigned Opc) const override; bool isDelayedSchedBarrier(const MachineInstr &MI) const override; bool isSchedBarrier(const MachineInstr &MI) const override; + bool isScalarMove(unsigned OpCode) const override; virtual unsigned getNumReservedDelaySlots(const MachineInstr &MI) const override; diff --git a/llvm/lib/Target/AIE/AIE2RegisterInfo.cpp b/llvm/lib/Target/AIE/AIE2RegisterInfo.cpp index 595f997c560d..319dfaf5e595 100644 --- 
a/llvm/lib/Target/AIE/AIE2RegisterInfo.cpp +++ b/llvm/lib/Target/AIE/AIE2RegisterInfo.cpp @@ -141,6 +141,10 @@ bool AIE2RegisterInfo::isReservedStickyReg(MCRegister PhysReg) const { } } +bool AIE2RegisterInfo::isRTypeReg(Register Reg) const { + return Reg.isPhysical() && AIE2::eRRegClass.contains(Reg); +} + const uint32_t *AIE2RegisterInfo::getNoPreservedMask() const { return CSR_NoRegs_RegMask; } diff --git a/llvm/lib/Target/AIE/AIE2RegisterInfo.h b/llvm/lib/Target/AIE/AIE2RegisterInfo.h index fa170ea87413..321849575881 100644 --- a/llvm/lib/Target/AIE/AIE2RegisterInfo.h +++ b/llvm/lib/Target/AIE/AIE2RegisterInfo.h @@ -82,6 +82,7 @@ struct AIE2RegisterInfo : public AIE2GenRegisterInfo { getCoveringSubRegs(const TargetRegisterClass &RC) const override; bool isSimplifiableReservedReg(MCRegister PhysReg) const override; bool isReservedStickyReg(MCRegister PhysReg) const override; + bool isRTypeReg(Register Reg) const override; const TargetRegisterClass *get2DIteratorRegClass() const override { return &AIE2::eDRegClass; diff --git a/llvm/lib/Target/AIE/AIEAlternateDescriptors.h b/llvm/lib/Target/AIE/AIEAlternateDescriptors.h index 01673a348efd..e7ed347d1651 100644 --- a/llvm/lib/Target/AIE/AIEAlternateDescriptors.h +++ b/llvm/lib/Target/AIE/AIEAlternateDescriptors.h @@ -22,6 +22,9 @@ namespace llvm { +using MutateInstructionMap = + std::unordered_map>; using MIAltDescsMap = std::unordered_map; class AIEAlternateDescriptors { @@ -40,7 +43,11 @@ class AIEAlternateDescriptors { const AIEBaseSubtarget &STI = AIEBaseSubtarget::get(*MI->getMF()); const AIEBaseInstrInfo *TII = STI.getInstrInfo(); - AlternateDescs[MI] = &TII->get(AltInstOpcode); + setAlternateDescriptor(MI, &TII->get(AltInstOpcode)); + } + + void setAlternateDescriptor(MachineInstr *MI, const MCInstrDesc *AltDesc) { + AlternateDescs[MI] = AltDesc; } // Return the alternate descriptor for the given multi-opcode instruction. 
diff --git a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h index 51f103d422e1..d691df1d0751 100644 --- a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h +++ b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h @@ -126,6 +126,9 @@ struct AIEBaseInstrInfo : public TargetInstrInfo { /// Check whether this is a scheduling barrier virtual bool isSchedBarrier(const MachineInstr &) const { return false; } + /// Check whether OpCode is a scalar move instruction + virtual bool isScalarMove(unsigned OpCode) const { return false; } + /// Returns the number of delay slots that this instruction requires. /// This might be 0 virtual unsigned @@ -137,6 +140,12 @@ struct AIEBaseInstrInfo : public TargetInstrInfo { /// not filled in by the scheduler. virtual unsigned getNumReservedDelaySlots(const MachineInstr &MI) const; + /// Return the opcodes of the delayed scalar move instructions, in increasing + /// order of delay. Empty by default. + virtual std::vector getDelayedScalarMoveOpcode() const { + return std::vector(); + } + /// Check whether Opc represents a JNZ instruction. This is mainly for /// detecting a downcounting loop branch. 
virtual bool isJNZ(unsigned Opc) const { return false; } diff --git a/llvm/lib/Target/AIE/AIEBaseRegisterInfo.h b/llvm/lib/Target/AIE/AIEBaseRegisterInfo.h index 4d617d79f179..86d7d9043c17 100644 --- a/llvm/lib/Target/AIE/AIEBaseRegisterInfo.h +++ b/llvm/lib/Target/AIE/AIEBaseRegisterInfo.h @@ -58,6 +58,10 @@ struct AIEBaseRegisterInfo : public TargetRegisterInfo { virtual bool isReservedStickyReg(MCRegister PhysReg) const { llvm_unreachable("Target didn't implement isReservedStickyReg!"); } + + /// Check if Reg is part of the R-type register file + virtual bool isRTypeReg(Register Reg) const { return false; } + #if 0 /// Returns a BitVector of the intersection of GPR RegClass /// and CalleeSaved Registers diff --git a/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp b/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp index 0e4fbe56ec73..e0e0327b1c88 100644 --- a/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp +++ b/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp @@ -108,6 +108,15 @@ FuncUnitWrapper &FuncUnitWrapper::operator|=(const FuncUnitWrapper &Other) { return *this; } +FuncUnitWrapper &FuncUnitWrapper::operator^=(const FuncUnitWrapper &Other) { + // XOR-ing in a FuncUnitWrapper that was OR-ed in earlier releases its resources. NOTE(review): XOR also clears bits that another reservation set as well - confirm such overlap cannot occur. 
+ Required ^= Other.Required; + Reserved ^= Other.Reserved; + Slots ^= Other.Slots; + MemoryBanks ^= Other.MemoryBanks; + return *this; +} + bool FuncUnitWrapper::conflict(const FuncUnitWrapper &Other) const { if ((Required & Other.Required) != 0 || (Slots & Other.Slots) != 0 || (MemoryBanks & Other.MemoryBanks) != 0 || @@ -447,6 +456,14 @@ auto toHazardType(bool Conflict) { } } // namespace +ScheduleHazardRecognizer::HazardType AIEHazardRecognizer::getHazardType( + const MCInstrDesc &Desc, MemoryBankBits MemoryBanks, + iterator_range MIOperands, + const MachineRegisterInfo &MRI, int DeltaCycles) { + return getHazardType(Scoreboard, Desc, MemoryBanks, MIOperands, MRI, + DeltaCycles); +} + // These functions interpret the itinerary, translating InstrStages // to ResourceCycles to apply. // We deviate from the standard ScoreboardHazardRecognizer by not @@ -470,10 +487,28 @@ ScheduleHazardRecognizer::HazardType AIEHazardRecognizer::getHazardType( FUDepthLimit)); } -bool AIEHazardRecognizer::checkConflict( +ConflictTypeBits AIEHazardRecognizer::checkConflict(MachineInstr &MI, + int DeltaCycles) { + assert(!TII->getFormatInterface()->getAlternateInstsOpcode(MI.getOpcode())); + return checkConflict(Scoreboard, MI, DeltaCycles); +} + +ConflictTypeBits AIEHazardRecognizer::checkConflict( const ResourceScoreboard &Scoreboard, MachineInstr &MI, int DeltaCycles) const { - const MCInstrDesc &Desc = MI.getDesc(); + assert(!TII->getFormatInterface()->getAlternateInstsOpcode(MI.getOpcode())); + return checkConflict(Scoreboard, MI, MI.getDesc(), DeltaCycles); +} + +ConflictTypeBits AIEHazardRecognizer::checkConflict(MachineInstr &MI, + const MCInstrDesc &Desc, + int DeltaCycles) { + return checkConflict(Scoreboard, MI, Desc, DeltaCycles); +} + +ConflictTypeBits AIEHazardRecognizer::checkConflict( + const ResourceScoreboard &Scoreboard, MachineInstr &MI, + const MCInstrDesc &Desc, int DeltaCycles) const { const unsigned SchedClass = TII->getSchedClass(Desc, MI.operands(), 
MI.getMF()->getRegInfo()); const MemoryBankBits MemoryBanks = getMemoryBanks(&MI); @@ -483,18 +518,41 @@ bool AIEHazardRecognizer::checkConflict( MemoryBanks, TII->getMemoryCycles(SchedClass), DeltaCycles, std::nullopt); } -bool AIEHazardRecognizer::checkConflict( +ConflictTypeBits AIEHazardRecognizer::checkConflict( const ResourceScoreboard &Scoreboard, const InstrItineraryData *ItinData, unsigned SchedClass, SlotBits SlotSet, MemoryBankBits MemoryBanks, SmallVector MemoryAccessCycles, int DeltaCycles, std::optional FUDepthLimit) { assert(Scoreboard.isValidDelta(DeltaCycles)); + ConflictTypeBits Conflict = static_cast(ConflictType::NoConflict); + + Conflict |= checkFormatConflict(Scoreboard, DeltaCycles, SlotSet); + + Conflict |= checkMemoryBankConflict(MemoryAccessCycles, Scoreboard, + DeltaCycles, MemoryBanks); + + Conflict |= checkFUConflict(ItinData, SchedClass, DeltaCycles, Scoreboard, + FUDepthLimit); + return Conflict; +} + +// Return ConflictType::Format on a format (slot) conflict, else NoConflict. +ConflictTypeBits AIEHazardRecognizer::checkFormatConflict( + const ResourceScoreboard &Scoreboard, int DeltaCycles, + unsigned SlotSet) { // Verify format hazards FuncUnitWrapper EmissionCycle(/*Req=*/0, /*Res=*/0, SlotSet); - if (EmissionCycle.conflict(Scoreboard[DeltaCycles])) - return true; + return static_cast(EmissionCycle.conflict(Scoreboard[DeltaCycles]) + ? ConflictType::Format + : ConflictType::NoConflict); +} +// Return ConflictType::MemoryBank on a memory bank conflict, else NoConflict. 
+ConflictTypeBits AIEHazardRecognizer::checkMemoryBankConflict( + const SmallVector &MemoryAccessCycles, + const ResourceScoreboard &Scoreboard, int DeltaCycles, + unsigned MemoryBanks) { // Verify memory bank hazards if (!MemoryAccessCycles.empty()) { FuncUnitWrapper MemoryBankAccessCycle(/*Req=*/0, /*Res=*/0, /*SlotSet=*/0, @@ -506,19 +564,26 @@ bool AIEHazardRecognizer::checkConflict( LLVM_DEBUG(dbgs() << "*** Memory bank conflict in cycle=" << AccessCycle << ":\n"; MemoryBankAccessCycle.dump(); dbgs() << "\n"); - return true; + return static_cast(ConflictType::MemoryBank); } } } + return static_cast(ConflictType::NoConflict); +} + +// Return ConflictType::FuncUnit on a functional unit conflict, else NoConflict. +ConflictTypeBits AIEHazardRecognizer::checkFUConflict( + const InstrItineraryData *ItinData, unsigned SchedClass, int DeltaCycles, + const ResourceScoreboard &Scoreboard, + const std::optional &FUDepthLimit) { // Note that Delta will be negative for bottom-up scheduling. // Cycle is 'our' cycle at which each stage of the itinerary starts. // It gets updated by the increment from the InstrStage. int Cycle = DeltaCycles; for (const InstrStage &IS : ItinData->getStages(SchedClass)) { - if (FUDepthLimit && (Cycle - DeltaCycles) >= *FUDepthLimit) { + if (FUDepthLimit && (Cycle - DeltaCycles) >= *FUDepthLimit) break; - } // Check availability of this stage's resources for the specified number // of cycles const FuncUnitWrapper ThisCycle(IS); @@ -526,19 +591,14 @@ bool AIEHazardRecognizer::checkConflict( int StageCycle = Cycle + (int)C; assert(StageCycle < Scoreboard.getDepth()); - if (ThisCycle.conflict(Scoreboard[StageCycle])) { - LLVM_DEBUG(dbgs() << "*** Hazard in cycle=" << StageCycle - << " EC=" << StageCycle - DeltaCycles << ":\n"; - ThisCycle.dump(); dbgs() << "\n"); - return true; - } + if (ThisCycle.conflict(Scoreboard[StageCycle])) + return static_cast(ConflictType::FuncUnit); } // Advance the cycle to the next stage. 
Cycle += IS.getNextCycles(); } - - return false; + return static_cast(ConflictType::NoConflict); } void AIEHazardRecognizer::emitInScoreboard( @@ -566,6 +626,26 @@ void AIEHazardRecognizer::emitInScoreboard( TII->getMemoryCycles(SchedClass), DeltaCycles, FUDepthLimit); } +void AIEHazardRecognizer::releaseFromScoreboard( + const MCInstrDesc &Desc, MemoryBankBits MemoryBanks, + iterator_range MIOperands, + const MachineRegisterInfo &MRI, int DeltaCycles) { + releaseFromScoreboard(Scoreboard, Desc, MemoryBanks, MIOperands, MRI, + DeltaCycles); +} + +void AIEHazardRecognizer::releaseFromScoreboard( + ResourceScoreboard &TheScoreboard, const MCInstrDesc &Desc, + MemoryBankBits MemoryBanks, + iterator_range MIOperands, + const MachineRegisterInfo &MRI, int DeltaCycles) const { + const unsigned SchedClass = TII->getSchedClass(Desc, MIOperands, MRI); + const SlotBits SlotSet = + getSlotSet(Desc, *TII->getFormatInterface(), IgnoreUnknownSlotSets); + releaseResources(TheScoreboard, ItinData, SchedClass, SlotSet, MemoryBanks, + TII->getMemoryCycles(SchedClass), DeltaCycles, FUDepthLimit); +} + void AIEHazardRecognizer::enterResources( ResourceScoreboard &Scoreboard, const InstrItineraryData *ItinData, unsigned SchedClass, SlotBits SlotSet, @@ -607,6 +687,47 @@ void AIEHazardRecognizer::enterResources( }); } +void AIEHazardRecognizer::releaseResources( + ResourceScoreboard &Scoreboard, + const InstrItineraryData *ItinData, unsigned SchedClass, SlotBits SlotSet, + MemoryBankBits MemoryBanks, SmallVector MemoryAccessCycles, + int DeltaCycles, std::optional FUDepthLimit) { + assert(Scoreboard.isValidDelta(DeltaCycles)); + + // Remove slot usage + FuncUnitWrapper EmissionCycle(/*Req=*/0, /*Res=*/0, SlotSet); + Scoreboard[DeltaCycles] ^= EmissionCycle; + + // Remove memory bank usage + if (!MemoryAccessCycles.empty()) { + FuncUnitWrapper MemoryBankAccessCycle(/*Req=*/0, /*Res=*/0, /*SlotSet=*/0, + MemoryBanks); + for (int Cycles : MemoryAccessCycles) { + Scoreboard[DeltaCycles + 
Cycles - 1] ^= MemoryBankAccessCycle; + } + } + + int Cycle = DeltaCycles; + Scoreboard[Cycle].IssueCount--; + for (const InstrStage &IS : ItinData->getStages(SchedClass)) { + if (FUDepthLimit && (Cycle - DeltaCycles) >= *FUDepthLimit) { + break; + } + const FuncUnitWrapper ResourceToRelease(IS); + for (unsigned int C = 0; C < IS.getCycles(); ++C) { + Scoreboard[Cycle + C] ^= ResourceToRelease; + } + + // Advance the cycle to the next stage. + Cycle += IS.getNextCycles(); + } + + LLVM_DEBUG({ + dbgs() << "Scoreboard after release resources:\n"; + Scoreboard.dump(); + }); +} + unsigned AIEHazardRecognizer::getPipelineDepth() const { return PipelineDepth; } unsigned AIEHazardRecognizer::getMaxLatency() const { return MaxLatency; } diff --git a/llvm/lib/Target/AIE/AIEHazardRecognizer.h b/llvm/lib/Target/AIE/AIEHazardRecognizer.h index 65a791a91979..fae47876fa5d 100644 --- a/llvm/lib/Target/AIE/AIEHazardRecognizer.h +++ b/llvm/lib/Target/AIE/AIEHazardRecognizer.h @@ -29,6 +29,7 @@ namespace llvm { class MachineInstr; +using ConflictTypeBits = std::uint32_t; void applyFormatOrdering(AIE::MachineBundle &Bundle, const VLIWFormat &Format, MachineInstr *BundleRoot, @@ -97,6 +98,7 @@ class FuncUnitWrapper { void dump() const; FuncUnitWrapper &operator|=(const FuncUnitWrapper &Other); + FuncUnitWrapper &operator^=(const FuncUnitWrapper &Other); bool conflict(const FuncUnitWrapper &Other) const; }; @@ -111,6 +113,13 @@ class AIEHazardRecognizer : public ScheduleHazardRecognizer { void computeMaxLatency(); public: + enum class ConflictType : std::uint32_t { + NoConflict = 0b000, + Format = 0b001, + MemoryBank = 0b010, + FuncUnit = 0b100, + }; + /// ScoreboardDepth can be used to speficy a fixed depth without querying the /// scheduling model. This is mostly used for testing, for other cases we /// should trust the instruction itineraries. 
@@ -161,6 +170,18 @@ class AIEHazardRecognizer : public ScheduleHazardRecognizer { iterator_range MIOperands, const MachineRegisterInfo &MRI, int DeltaCycles); + void releaseFromScoreboard(ResourceScoreboard &Scoreboard, + const MCInstrDesc &Desc, + MemoryBankBits MemoryBanks, + iterator_range MIOperands, + const MachineRegisterInfo &MRI, + int DeltaCycles) const; + // Apply the above function to the local scoreboard. + void releaseFromScoreboard(const MCInstrDesc &Desc, + MemoryBankBits MemoryBanks, + iterator_range MIOperands, + const MachineRegisterInfo &MRI, int DeltaCycles); + /// Block all scoreboard resources at DeltaCycles void blockCycleInScoreboard(int DeltaCycle); @@ -201,19 +222,48 @@ class AIEHazardRecognizer : public ScheduleHazardRecognizer { const MCInstrDesc &Desc, MemoryBankBits MemoryBanks, iterator_range MIOperands, const MachineRegisterInfo &MRI, int DeltaCycles) const; - bool checkConflict(const ResourceScoreboard &Scoreboard, - MachineInstr &MI, int DeltaCycles) const; + ScheduleHazardRecognizer::HazardType + getHazardType(const MCInstrDesc &Desc, MemoryBankBits MemoryBanks, + iterator_range MIOperands, + const MachineRegisterInfo &MRI, int DeltaCycles); + + ConflictTypeBits + checkConflict(const ResourceScoreboard &Scoreboard, + MachineInstr &MI, const MCInstrDesc &Desc, + int DeltaCycles) const; + ConflictTypeBits checkConflict(MachineInstr &MI, const MCInstrDesc &Desc, + int DeltaCycles); + + ConflictTypeBits + checkConflict(const ResourceScoreboard &Scoreboard, + MachineInstr &MI, int DeltaCycles) const; + ConflictTypeBits checkConflict(MachineInstr &MI, int DeltaCycles); protected: ScheduleHazardRecognizer::HazardType getHazardType(const MCInstrDesc &Desc, int DeltaCycles); - static bool + static ConflictTypeBits checkConflict(const ResourceScoreboard &Scoreboard, const InstrItineraryData *ItinData, unsigned SchedClass, SlotBits SlotSet, MemoryBankBits MemoryBanks, SmallVector MemoryAccessCycles, int DeltaCycles, std::optional 
FUDepthLimit); + static ConflictTypeBits + checkFormatConflict(const ResourceScoreboard &Scoreboard, + int DeltaCycles, unsigned SlotSet); + + static ConflictTypeBits + checkMemoryBankConflict(const SmallVector &MemoryAccessCycles, + const ResourceScoreboard &Scoreboard, + int DeltaCycles, unsigned MemoryBanks); + + static ConflictTypeBits + checkFUConflict(const InstrItineraryData *ItinData, unsigned SchedClass, + int DeltaCycles, + const ResourceScoreboard &Scoreboard, + const std::optional &FUDepthLimit); + static void enterResources(ResourceScoreboard &Scoreboard, const InstrItineraryData *ItinData, unsigned SchedClass, SlotBits SlotSet, @@ -221,6 +271,14 @@ class AIEHazardRecognizer : public ScheduleHazardRecognizer { SmallVector MemoryAccessCycles, int DeltaCycles, std::optional FUDepthLimit); + static void releaseResources(ResourceScoreboard &Scoreboard, + const InstrItineraryData *ItinData, + unsigned SchedClass, SlotBits SlotSet, + MemoryBankBits MemoryBanks, + SmallVector MemoryAccessCycles, + int DeltaCycles, + std::optional FUDepthLimit); + private: ResourceScoreboard Scoreboard; const AIEBaseInstrInfo *TII; diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp index d6896bd624da..e5fde41f8b45 100644 --- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp +++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp @@ -8,11 +8,18 @@ // //===----------------------------------------------------------------------===// -#include "AIEMachineScheduler.h" +#include "AIE2.h" +#include "AIE2GenInstrInfo.inc" +#include "AIE2InstrInfo.h" +#include "AIE2RegisterInfo.h" +#include "AIE2Subtarget.h" +#include "AIE2TargetMachine.h" + #include "AIEBaseAliasAnalysis.h" #include "AIEBaseInstrInfo.h" #include "AIEHazardRecognizer.h" #include "AIEInterBlockScheduling.h" +#include "AIEMachineScheduler.h" #include "AIEMaxLatencyFinder.h" #include "AIEPostPipeliner.h" #include "Utils/AIELoopUtils.h" @@ -93,6 +100,16 @@ static cl::opt 
PreSchedFollowsSkipPipeliner( "aie-presched-follows-skip-pipeliner", cl::init(true), cl::desc("Don't run the prescheduler if the pipeliner is skipped")); +/// This option enables instruction mutation to shift a multislot instruction +/// in event of a slot conflict. +static cl::opt InstructionMutation( + "aie-instruction-mutation", cl::init(true), + cl::desc("Allow instruction mutation to shift a multislot " + "instruction in event of a slot conflict")); + +static cl::opt + UseDelayedMove("aie-use-delayed-move", cl::init(true), + cl::desc("Allow delayed move to resolve FU conflict")); namespace { // A sentinel value to represent an unknown SUnit. const constexpr unsigned UnknownSUNum = ~0; @@ -122,6 +139,12 @@ const AIEBaseInstrInfo *getTII(const ScheduleDAGMI &DAG) { return static_cast(DAG.TII); } +/// Shorthand to get TargetRegisterInfo from a MachineBasicBlock +const AIEBaseRegisterInfo *getTRI(MachineBasicBlock *MBB) { + return static_cast( + MBB->getParent()->getSubtarget().getRegisterInfo()); +} + void bumpCycleForBundles(unsigned ToCycle, std::vector &Bundles, AIE::MachineBundle &CurrBundle) { @@ -532,6 +555,229 @@ bool AIEPostRASchedStrategy::isFreeSU(const SUnit &SU) const { SU.NodeNum < FirstBotFixedSU.value_or(NumUpperBound); } +bool AIEPostRASchedStrategy::canUseDelayedMove(SUnit &SU, SchedBoundary &Zone, + const int DeltaCycle) { + // Return true (selecting the opcode) if a delayed MOV fits at DeltaCycle. + if (!UseDelayedMove) + return false; + + const AIEBaseInstrInfo *TII = getTII(CurMBB); + const AIEBaseRegisterInfo *TRI = getTRI(CurMBB); + MachineInstr *MI = SU.getInstr(); + + if (!TII->isScalarMove(MI->getOpcode())) + return false; + + // Check if the destination register is an R-type register + const Register DstReg = MI->getOperand(0).getReg(); + if (!TRI->isRTypeReg(DstReg)) + return false; + + // Bail out unless the hazard involves a functional unit (FU) conflict + AIEHazardRecognizer &HR = *getAIEHazardRecognizer(Zone); + if (!(HR.checkConflict(*MI, DeltaCycle) & + static_cast(AIEHazardRecognizer::ConflictType::FuncUnit))) + return false; + + 
// Check if the instruction has only one successor and it is the ExitSU + if (SU.Succs.size() == 1 && SU.Succs[0].getSUnit() == &Zone.DAG->ExitSU) + return false; + + // Find the max of the BotReadyCycle of the successors, to find by how many + // cycles the MOV can be delayed + unsigned MaxBotReadyCycle = 0; + for (const SDep &Succ : SU.Succs) { + // Anti-dependencies are not considered since the delayed MOV reads the src + // reg in the same cycle + if (Succ.getSUnit() == &Zone.DAG->ExitSU || Succ.getKind() == SDep::Anti) + continue; + MaxBotReadyCycle = + std::max(MaxBotReadyCycle, Succ.getSUnit()->BotReadyCycle); + } + + const int CurrCycle = Zone.getCurrCycle(); + const int CanDelayBy = // Signed on purpose: the slack may be <= 0 and must not wrap around. + (CurrCycle - DeltaCycle) - int(MaxBotReadyCycle) - 1; + if (CanDelayBy <= 0) + return false; + + int Delay = 1; + for (const auto &DelMove : TII->getDelayedScalarMoveOpcode()) { + if (Delay > CanDelayBy) + break; + const unsigned Conflict = + HR.checkConflict(*MI, TII->get(DelMove), DeltaCycle); + if (!(Conflict)) { + HR.getSelectedAltDescs().setAlternateDescriptor(MI, DelMove); + return true; + } + ++Delay; + } + + return false; +} + +static bool checkSlotConflict(const unsigned OpCodeA, const unsigned OpCodeB, + const AIEBaseMCFormats &Formats) { + const MCSlotKind SlotKindA = Formats.getSlotKind(OpCodeA); + const MCSlotKind SlotKindB = Formats.getSlotKind(OpCodeB); + const MCSlotKind UnknownSlot = MCSlotKind(); + + if (SlotKindA == UnknownSlot || SlotKindB == UnknownSlot) + return false; + + const auto *SlotInfoA = Formats.getSlotInfo(SlotKindA); + const auto *SlotInfoB = Formats.getSlotInfo(SlotKindB); + + return SlotInfoA->getSlotSet() & SlotInfoB->getSlotSet(); +} + +// Check if moving a scheduled multi-slot instruction to a different slot allows +// us to schedule SU in the same DeltaCycle. 
If a multi-slot instruction +// candidate is identified, it is preserved in a map so that when SU is +// scheduled, the candidate is first moved to a new slot and then the SU is +// emitted. +bool AIEPostRASchedStrategy::canShiftSlot(SUnit &SU, SchedBoundary &Zone, + const int DeltaCycle) { + + if (!InstructionMutation) + return false; + + const ScheduleDAGMI &DAG = *Zone.DAG; + const AIEBaseInstrInfo &TII = *getTII(DAG); + const AIEBaseMCFormats &Formats = *TII.getFormatInterface(); + AIEHazardRecognizer &HR = *getAIEHazardRecognizer(Zone); + MachineInstr *NewMI = SU.getInstr(); + std::vector ScheduledMultiSlotInsts; + bool CanShiftSlot = false; + + // Find and cache if there are any multi-slot instructions scheduled in the + // same delta cycle + for (MachineInstr &MI : DAG) { + SUnit *ZoneSU = DAG.getSUnit(&MI); + if (!ZoneSU) + continue; + if (!ZoneSU->isScheduled) + continue; + + const int CurrCycle = Zone.getCurrCycle(); + if (ZoneSU->BotReadyCycle != + static_cast(CurrCycle - DeltaCycle)) + continue; + + // Check for a MultiSlot instruction scheduled in the same DeltaCycle, + // we focus on multi-slot because they can be scheduled in different + // slots + auto AltOpcodes = Formats.getAlternateInstsOpcode(MI.getOpcode()); + if (!AltOpcodes) + continue; + ScheduledMultiSlotInsts.push_back(&MI); + } + + // If there are no multi-slot instructions scheduled in the same DeltaCycle we + // cannot shift any instruction to a different slot. + if (ScheduledMultiSlotInsts.empty()) + return false; + + auto DefaultOpcode = std::vector{SU.getInstr()->getOpcode()}; + const std::vector *NewMIAltOpcodes = + Formats.getAlternateInstsOpcode(SU.getInstr()->getOpcode()) + ? 
Formats.getAlternateInstsOpcode(SU.getInstr()->getOpcode()) + : &DefaultOpcode; + + for (const unsigned int NewMIAltOpcode : *NewMIAltOpcodes) { + const MCInstrDesc &NewMIAltDesc = TII.get(NewMIAltOpcode); + if (!(HR.checkConflict(*NewMI, NewMIAltDesc, DeltaCycle) & + static_cast(AIEHazardRecognizer::ConflictType::Format))) + continue; + + for (MachineInstr *MI : ScheduledMultiSlotInsts) { + SUnit *ZoneSU = DAG.getSUnit(MI); + const int CurrCycle = Zone.getCurrCycle(); + auto AltOpcodes = Formats.getAlternateInstsOpcode(MI->getOpcode()); + + // Check if the scheduled multi-slot instruction has a slot conflict + // with the new instruction, if so we might have the possibility to shift + // the multi-slot and schedule the new instruction. + if (!checkSlotConflict(HR.getSelectedAltDescs().getOpcode(MI), + NewMIAltOpcode, Formats)) + continue; + + // Release the multi-slot instruction from the scoreboard to check if + // any other alternate opcode in presence of the new instruction will + // not create a hazard. + HR.releaseFromScoreboard(*HR.getSelectedAltDescs().getDesc(MI), + HR.getMemoryBanks(MI), MI->operands(), + MI->getMF()->getRegInfo(), + CurrCycle - ZoneSU->BotReadyCycle); + + // Check if the new instruction can be scheduled after unscheduling + // the conflicting multi-slot instruction. + if (HR.getHazardType(NewMIAltDesc, HR.getMemoryBanks(NewMI), + NewMI->operands(), NewMI->getMF()->getRegInfo(), + DeltaCycle) != + ScheduleHazardRecognizer::HazardType::NoHazard) { + // If the new instruction cannot be scheduled after unscheduling the + // multi-slot revert back the state of scoreboard to original state + // and continue. + + HR.emitInScoreboard(*HR.getSelectedAltDescs().getDesc(MI), + HR.getMemoryBanks(MI), MI->operands(), + MI->getMF()->getRegInfo(), + CurrCycle - ZoneSU->BotReadyCycle); + continue; + } + + // Emit the new instruction in the scoreboard. 
This will help us + // to check if the previously unscheduled multi-slot instruction + // can be scheduled in the same cycle, with an alternate opcode. + HR.emitInScoreboard(NewMIAltDesc, HR.getMemoryBanks(NewMI), + NewMI->operands(), NewMI->getMF()->getRegInfo(), + DeltaCycle); + + // Check if the previously unscheduled multi-slot instruction + // can be rescheduled in presence of the new instruction in the + // same cycle, with a different opcode. + for (const auto AltOpcodeInside : *AltOpcodes) { + const MCInstrDesc &Desc = TII.get(AltOpcodeInside); + if (HR.getHazardType(Desc, HR.getMemoryBanks(MI), MI->operands(), + MI->getMF()->getRegInfo(), DeltaCycle) == + ScheduleHazardRecognizer::HazardType::NoHazard) { + // Cache the information to mutate the instruction during schedNode() + MutateInstruction.insert( + std::make_pair(NewMI, std::make_pair(MI, &Desc))); + + // If the new instruction was a multi-slot instruction and it failed + // the general check for isAvailableNode() this means we have not set + // the selected opcode for the instruction. Set the selected opcode + // for the instruction. + if (NewMIAltOpcodes->size() > 1) + HR.getSelectedAltDescs().setAlternateDescriptor(NewMI, + &NewMIAltDesc); + + CanShiftSlot = true; + break; + } + } + + // Revert back the state of scoreboard to original state. + HR.releaseFromScoreboard(NewMIAltDesc, HR.getMemoryBanks(NewMI), + NewMI->operands(), NewMI->getMF()->getRegInfo(), + DeltaCycle); + HR.emitInScoreboard(*HR.getSelectedAltDescs().getDesc(MI), + HR.getMemoryBanks(MI), MI->operands(), + MI->getMF()->getRegInfo(), + CurrCycle - ZoneSU->BotReadyCycle); + + if (CanShiftSlot) + break; + } + if (CanShiftSlot) + break; + } + return CanShiftSlot; +} + bool AIEPostRASchedStrategy::isAvailableNode(SUnit &SU, SchedBoundary &Zone, bool /*VerifyReadyCycle*/) { // Note we use signed integers to avoid wrap-around behavior. 
@@ -561,7 +807,9 @@ bool AIEPostRASchedStrategy::isAvailableNode(SUnit &SU, SchedBoundary &Zone, // ReadyCycle is always greater or equal to the current cycle, // so DeltaCycles will always be less or equal to 0. if (Zone.checkHazard(&SU, DeltaCycles)) - continue; + if (!canShiftSlot(SU, Zone, DeltaCycles)) + if (!canUseDelayedMove(SU, Zone, DeltaCycles)) + continue; SU.BotReadyCycle = CurrCycle - DeltaCycles; return true; } @@ -578,10 +826,35 @@ void AIEPostRASchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { if (IsTopNode) { PostGenericScheduler::schedNode(SU, IsTopNode); } else { + AIEHazardRecognizer &HR = *getAIEHazardRecognizer(Bot); int DeltaCycles = int(Bot.getCurrCycle()) - int(SU->BotReadyCycle); assert(DeltaCycles <= 0); + + // Check if an instruction needs to be moved to a different slot for the + // current SU to be scheduled in the DeltaCycles. + if (MutateInstruction.find(SU->getInstr()) != MutateInstruction.end()) { + auto &[MI, Desc] = MutateInstruction[SU->getInstr()]; + HR.releaseFromScoreboard(*HR.getSelectedAltDescs().getDesc(MI), + HR.getMemoryBanks(MI), MI->operands(), + MI->getMF()->getRegInfo(), DeltaCycles); + // Update the selected opcode for the instruction, refer + // AIEPostRASchedStrategy::canShiftSlot() + HR.getSelectedAltDescs().setAlternateDescriptor(MI, Desc); + + assert(HR.getHazardType(*Desc, HR.getMemoryBanks(MI), MI->operands(), + MI->getMF()->getRegInfo(), DeltaCycles) == + ScheduleHazardRecognizer::HazardType::NoHazard); + // Reschedule the instruction with the new opcode. + HR.emitInScoreboard(*Desc, HR.getMemoryBanks(MI), MI->operands(), + MI->getMF()->getRegInfo(), DeltaCycles); + } + Bot.bumpNode(SU, DeltaCycles); } + // Clear the MutateInstruction map since after scheduling the instruction the + // validity of mutation map can no longer be guaranteed. 
+ MutateInstruction.clear(); + SU->isScheduled = true; } void AIEPostRASchedStrategy::enterFunction(MachineFunction *MF) { diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.h b/llvm/lib/Target/AIE/AIEMachineScheduler.h index 213aa48f714b..eacbc9ed9194 100644 --- a/llvm/lib/Target/AIE/AIEMachineScheduler.h +++ b/llvm/lib/Target/AIE/AIEMachineScheduler.h @@ -37,6 +37,7 @@ std::vector computeAndFinalizeBundles(SchedBoundary &Zone); class AIEPostRASchedStrategy : public PostGenericScheduler { /// Maintain the state of interblock/loop-aware scheduling AIE::InterBlockScheduling InterBlock; + MutateInstructionMap MutateInstruction; public: AIEPostRASchedStrategy(const MachineSchedContext *C); @@ -50,6 +51,8 @@ class AIEPostRASchedStrategy : public PostGenericScheduler { SUnit *pickNodeAndCycle(bool &IsTopNode, std::optional &BotEmissionCycle) override; + bool canShiftSlot(SUnit &SU, SchedBoundary &Zone, const int DeltaCycle); + bool canUseDelayedMove(SUnit &SU, SchedBoundary &Zone, const int DeltaCycle); bool isAvailableNode(SUnit &SU, SchedBoundary &Zone, bool VerifyReadyCycle) override; diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/legalize-dyn-stackalloc.ll b/llvm/test/CodeGen/AIE/aie2/GlobalISel/legalize-dyn-stackalloc.ll index 2915bbb173ef..ac0d01a076bd 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/legalize-dyn-stackalloc.ll +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/legalize-dyn-stackalloc.ll @@ -12,15 +12,13 @@ define void @test_simple_dyn_alloca(i32 noundef %n) { ; CHECK-LABEL: test_simple_dyn_alloca: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: paddb [sp], #32; nopa ; nops ; nopxm ; nopv -; CHECK-NEXT: mova r1, #2; nopx +; CHECK-NEXT: padda [sp], #32; nopb ; movx r1, #2 +; CHECK-NEXT: lshl r0, r0, r1 ; CHECK-NEXT: st p7, [sp, #-32] // 4-byte Folded Spill ; CHECK-NEXT: mov p7, sp ; CHECK-NEXT: mov p1, sp -; CHECK-NEXT: lshl r0, r0, r1 -; CHECK-NEXT: mova r1, #-32 ; CHECK-NEXT: st lr, [sp, #-28] // 4-byte Folded Spill -; 
CHECK-NEXT: padda [p7], #-32 +; CHECK-NEXT: mova r1, #-32; padds [p7], #-32 ; CHECK-NEXT: add r0, r0, #31 ; CHECK-NEXT: jl #extern_call ; CHECK-NEXT: mov p0, p1 // Delay Slot 5 @@ -53,24 +51,19 @@ define void @test_loop_dyn_alloca(i32 noundef %n) { ; CHECK-LABEL: test_loop_dyn_alloca: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; paddb [sp], #64; nopxm -; CHECK-NEXT: st p7, [sp, #-64] // 4-byte Folded Spill -; CHECK-NEXT: mov p7, sp -; CHECK-NEXT: st r16, [sp, #-36] // 4-byte Folded Spill -; CHECK-NEXT: mova r16, #1 +; CHECK-NEXT: paddb [sp], #64; nopa ; nops ; nopxm ; nopv +; CHECK-NEXT: st r16, [sp, #-36]; nopx // 4-byte Folded Spill ; CHECK-NEXT: st r17, [sp, #-40] // 4-byte Folded Spill -; CHECK-NEXT: mova r17, #0 ; CHECK-NEXT: st r18, [sp, #-44] // 4-byte Folded Spill -; CHECK-NEXT: mova r18, #10 +; CHECK-NEXT: mova r16, #1; movx r18, #10; mov r17, #0 ; CHECK-NEXT: st r19, [sp, #-48] // 4-byte Folded Spill -; CHECK-NEXT: mova r19, #2 ; CHECK-NEXT: st r20, [sp, #-52] // 4-byte Folded Spill -; CHECK-NEXT: mova r20, #-32 ; CHECK-NEXT: st r21, [sp, #-56] // 4-byte Folded Spill -; CHECK-NEXT: mova r21, #0 +; CHECK-NEXT: st p7, [sp, #-64] // 4-byte Folded Spill +; CHECK-NEXT: mov p7, sp ; CHECK-NEXT: st lr, [sp, #-32] // 4-byte Folded Spill ; CHECK-NEXT: st p6, [sp, #-60] // 4-byte Folded Spill -; CHECK-NEXT: padda [p7], #-64 +; CHECK-NEXT: mova r19, #2; padds [p7], #-64; movx r21, #0; mov r20, #-32 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB1_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 @@ -133,11 +126,9 @@ define void @test_huge_stack(i32 noundef %n) #0 { ; CHECK-LABEL: test_huge_stack: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; paddb [sp], #40064; nopx +; CHECK-NEXT: nopa ; paddb [sp], #40064; nopxm ; CHECK-NEXT: movxm m0, #-40064 -; CHECK-NEXT: mova r1, #0 -; CHECK-NEXT: mova r2, #2 -; CHECK-NEXT: mova r3, #-32 +; CHECK-NEXT: mova r1, #0; movx r3, #-32; mov r2, #2 ; CHECK-NEXT: 
st p7, [sp, #-40064] // 4-byte Folded Spill ; CHECK-NEXT: mov p7, sp ; CHECK-NEXT: mov p1, sp @@ -150,11 +141,10 @@ define void @test_huge_stack(i32 noundef %n) #0 { ; CHECK-NEXT: mov p2, p7 ; CHECK-NEXT: mov p6, p7 ; CHECK-NEXT: paddb [p0], m0 -; CHECK-NEXT: paddb [p6], #-32 ; CHECK-NEXT: movxm m0, #-40032 ; CHECK-NEXT: st r0, [p0, #0] ; CHECK-NEXT: lda r0, [p0, #0] -; CHECK-NEXT: paddb [p2], m0 +; CHECK-NEXT: paddb [p6], #-32; padds [p2], m0 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: mov r16, p2 ; CHECK-NEXT: st p0, [p6, #0] diff --git a/llvm/test/CodeGen/AIE/aie2/accfloat.ll b/llvm/test/CodeGen/AIE/aie2/accfloat.ll index 789f45baac0b..06c05bc988ea 100644 --- a/llvm/test/CodeGen/AIE/aie2/accfloat.ll +++ b/llvm/test/CodeGen/AIE/aie2/accfloat.ll @@ -11,9 +11,7 @@ define dso_local noundef <8 x i64> @test_add_conf(<8 x i64> noundef %acc1, <8 x ; CHECK-LABEL: test_add_conf: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopb ; mova r3, #12; nops ; nopxm ; nopv -; CHECK-NEXT: mova r4, #13; nopx -; CHECK-NEXT: mova r5, #28 +; CHECK-NEXT: mova r3, #12; movx r5, #28; mov r4, #13 ; CHECK-NEXT: lshl r1, r1, r3 ; CHECK-NEXT: lshl r2, r2, r4 ; CHECK-NEXT: or r0, r1, r0 @@ -41,9 +39,7 @@ define dso_local noundef <8 x i64> @test_sub_conf(<8 x i64> noundef %acc1, <8 x ; CHECK-LABEL: test_sub_conf: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopb ; mova r3, #12; nops ; nopxm ; nopv -; CHECK-NEXT: mova r4, #13; nopx -; CHECK-NEXT: mova r5, #28 +; CHECK-NEXT: mova r3, #12; movx r5, #28; mov r4, #13 ; CHECK-NEXT: lshl r1, r1, r3 ; CHECK-NEXT: lshl r2, r2, r4 ; CHECK-NEXT: or r0, r1, r0 diff --git a/llvm/test/CodeGen/AIE/aie2/addr_1d2d3d.ll b/llvm/test/CodeGen/AIE/aie2/addr_1d2d3d.ll index d39b5d32afe6..7cffd7efb679 100644 --- a/llvm/test/CodeGen/AIE/aie2/addr_1d2d3d.ll +++ b/llvm/test/CodeGen/AIE/aie2/addr_1d2d3d.ll @@ -170,9 +170,8 @@ define dso_local ptr @test_add_2d_ptr_backTOback_call(ptr %a, i32 noundef %off, ; CHECK-LABEL: 
test_add_2d_ptr_backTOback_call: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopx ; mov p0, p1 -; CHECK-NEXT: mova r3, #6 -; CHECK-NEXT: mova dc0, #0 +; CHECK-NEXT: nopa ; nopx ; mov p0, p1 +; CHECK-NEXT: mova r3, #6; mov dc0, #0 ; CHECK-NEXT: mov dn0, r1 ; CHECK-NEXT: lshl r0, r0, r3 ; CHECK-NEXT: ret lr @@ -200,9 +199,8 @@ define dso_local ptr @test_add_3d_ptr_backTOback_call(ptr %a, i32 noundef %off, ; CHECK-LABEL: test_add_3d_ptr_backTOback_call: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopx ; mov p0, p1 -; CHECK-NEXT: mova r5, #6 -; CHECK-NEXT: mova dc0, #0 +; CHECK-NEXT: nopa ; nopx ; mov p0, p1 +; CHECK-NEXT: mova r5, #6; mov dc0, #0 ; CHECK-NEXT: mov dn0, r1 ; CHECK-NEXT: mov dn4, r3 ; CHECK-NEXT: lshl r0, r0, r5 diff --git a/llvm/test/CodeGen/AIE/aie2/aiev2_v2int32.ll b/llvm/test/CodeGen/AIE/aie2/aiev2_v2int32.ll index 81cfb1e47739..037a5157b8a2 100644 --- a/llvm/test/CodeGen/AIE/aie2/aiev2_v2int32.ll +++ b/llvm/test/CodeGen/AIE/aie2/aiev2_v2int32.ll @@ -29,8 +29,8 @@ define dso_local noundef i64 @_Z14return_v2int32v() local_unnamed_addr #0 { ; CHECK-NEXT: nopx // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: mova r0, #100 // Delay Slot 2 -; CHECK-NEXT: mova r1, #0 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: mova r0, #100; movx r1, #0 // Delay Slot 1 entry: ret i64 100 } diff --git a/llvm/test/CodeGen/AIE/aie2/dyn-stackalloc.ll b/llvm/test/CodeGen/AIE/aie2/dyn-stackalloc.ll index 337fecd1e4bd..ab01f8ad3348 100644 --- a/llvm/test/CodeGen/AIE/aie2/dyn-stackalloc.ll +++ b/llvm/test/CodeGen/AIE/aie2/dyn-stackalloc.ll @@ -12,15 +12,13 @@ define void @test_simple_dyn_alloca(i32 noundef %n) { ; CHECK-LABEL: test_simple_dyn_alloca: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: paddb [sp], #32; nopa ; nops ; nopxm ; nopv -; CHECK-NEXT: mova r1, #2; nopx +; CHECK-NEXT: padda [sp], #32; nopb ; movx r1, #2 +; CHECK-NEXT: 
lshl r0, r0, r1 ; CHECK-NEXT: st p7, [sp, #-32] // 4-byte Folded Spill ; CHECK-NEXT: mov p7, sp ; CHECK-NEXT: mov p1, sp -; CHECK-NEXT: lshl r0, r0, r1 -; CHECK-NEXT: mova r1, #-32 ; CHECK-NEXT: st lr, [sp, #-28] // 4-byte Folded Spill -; CHECK-NEXT: padda [p7], #-32 +; CHECK-NEXT: mova r1, #-32; padds [p7], #-32 ; CHECK-NEXT: add r0, r0, #31 ; CHECK-NEXT: jl #extern_call ; CHECK-NEXT: mov p0, p1 // Delay Slot 5 @@ -53,24 +51,19 @@ define void @test_loop_dyn_alloca(i32 noundef %n) { ; CHECK-LABEL: test_loop_dyn_alloca: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; paddb [sp], #64; nopxm -; CHECK-NEXT: st p7, [sp, #-64] // 4-byte Folded Spill -; CHECK-NEXT: mov p7, sp -; CHECK-NEXT: st r16, [sp, #-36] // 4-byte Folded Spill -; CHECK-NEXT: mova r16, #1 +; CHECK-NEXT: paddb [sp], #64; nopa ; nops ; nopxm ; nopv +; CHECK-NEXT: st r16, [sp, #-36]; nopx // 4-byte Folded Spill ; CHECK-NEXT: st r17, [sp, #-40] // 4-byte Folded Spill -; CHECK-NEXT: mova r17, #0 ; CHECK-NEXT: st r18, [sp, #-44] // 4-byte Folded Spill -; CHECK-NEXT: mova r18, #10 +; CHECK-NEXT: mova r16, #1; movx r18, #10; mov r17, #0 ; CHECK-NEXT: st r19, [sp, #-48] // 4-byte Folded Spill -; CHECK-NEXT: mova r19, #2 ; CHECK-NEXT: st r20, [sp, #-52] // 4-byte Folded Spill -; CHECK-NEXT: mova r20, #-32 ; CHECK-NEXT: st r21, [sp, #-56] // 4-byte Folded Spill -; CHECK-NEXT: mova r21, #0 +; CHECK-NEXT: st p7, [sp, #-64] // 4-byte Folded Spill +; CHECK-NEXT: mov p7, sp ; CHECK-NEXT: st lr, [sp, #-32] // 4-byte Folded Spill ; CHECK-NEXT: st p6, [sp, #-60] // 4-byte Folded Spill -; CHECK-NEXT: padda [p7], #-64 +; CHECK-NEXT: mova r19, #2; padds [p7], #-64; movx r21, #0; mov r20, #-32 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB1_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 @@ -133,11 +126,9 @@ define void @test_huge_stack(i32 noundef %n) #0 { ; CHECK-LABEL: test_huge_stack: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; paddb [sp], 
#40064; nopx +; CHECK-NEXT: nopa ; paddb [sp], #40064; nopxm ; CHECK-NEXT: movxm m0, #-40064 -; CHECK-NEXT: mova r1, #0 -; CHECK-NEXT: mova r2, #2 -; CHECK-NEXT: mova r3, #-32 +; CHECK-NEXT: mova r1, #0; movx r3, #-32; mov r2, #2 ; CHECK-NEXT: st p7, [sp, #-40064] // 4-byte Folded Spill ; CHECK-NEXT: mov p7, sp ; CHECK-NEXT: mov p1, sp @@ -150,11 +141,10 @@ define void @test_huge_stack(i32 noundef %n) #0 { ; CHECK-NEXT: mov p2, p7 ; CHECK-NEXT: mov p6, p7 ; CHECK-NEXT: paddb [p0], m0 -; CHECK-NEXT: paddb [p6], #-32 ; CHECK-NEXT: movxm m0, #-40032 ; CHECK-NEXT: st r0, [p0, #0] ; CHECK-NEXT: lda r0, [p0, #0] -; CHECK-NEXT: paddb [p2], m0 +; CHECK-NEXT: paddb [p6], #-32; padds [p2], m0 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: mov r16, p2 ; CHECK-NEXT: st p0, [p6, #0] diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll index 17dfb2a60671..48829480f339 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll @@ -72,23 +72,20 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm ; ASM-NEXT: lda r9, [p5, #0]; paddb [p6], #-56; mov p5, sp ; ASM-NEXT: lda r6, [p6, #0]; paddb [p5], #-80; mov p4, sp ; ASM-NEXT: lda r10, [p5, #0]; paddb [p4], #-60; mov p5, sp -; ASM-NEXT: lda p6, [p4, #0]; paddb [p5], #-84 -; ASM-NEXT: lda r11, [p5, #0]; mov p0, sp -; ASM-NEXT: paddb [p0], #-72; mov p4, sp -; ASM-NEXT: lda p0, [p0, #0]; paddb [p4], #-64; mov p5, sp +; ASM-NEXT: lda p6, [p4, #0]; paddb [p5], #-84; mov p4, sp +; ASM-NEXT: lda r11, [p5, #0]; paddb [p4], #-64; mov p5, sp ; ASM-NEXT: lda p7, [p4, #0]; paddb [p5], #-88; mov p4, sp ; ASM-NEXT: lda r12, [p5, #0]; paddb [p4], #-68; mov p5, sp -; ASM-NEXT: lda p4, [p4, #0]; paddb [p5], #-92 -; ASM-NEXT: lda r13, [p5, #0] -; ASM-NEXT: mova r6, #1; add r7, r1, #-1; mov p5, r6 -; ASM-NEXT: mova r6, #3; ne r3, r3, r6 +; ASM-NEXT: lda p4, [p4, #0]; paddb [p5], #-92; add r7, r1, #-1; mov 
p0, sp +; ASM-NEXT: lda r13, [p5, #0]; paddb [p0], #-72; movx r6, #1; mov p5, r6 +; ASM-NEXT: lda p0, [p0, #0]; ne r3, r3, r6; mov r6, #3 ; ASM-NEXT: ltu r7, r7, r6 ; ASM-NEXT: jz r7, #.LBB0_2 ; ASM-NEXT: st dn4, [p5, #0]; nez r0, r0 // Delay Slot 5 ; ASM-NEXT: st r0, [p6, #0] // Delay Slot 4 ; ASM-NEXT: paddb [p2], m3; st r5, [p7, #0] // Delay Slot 3 -; ASM-NEXT: padda [p1], m2; paddb [p2], m5; and r8, r1, r6; st r3, [p4, #0] // Delay Slot 2 -; ASM-NEXT: mova r6, #0; paddb [p2], m4; st r8, [p0, #0] // Delay Slot 1 +; ASM-NEXT: st r3, [p4, #0]; paddb [p2], m5; and r8, r1, r6 // Delay Slot 2 +; ASM-NEXT: padda [p1], m2; paddb [p2], m4; movx r6, #0; st r8, [p0, #0] // Delay Slot 1 ; ASM-NEXT: // %bb.1: ; ASM-NEXT: nopb ; nopa ; nops ; j #.LBB0_6; nopv ; ASM-NEXT: nopa ; nopx // Delay Slot 5 diff --git a/llvm/test/CodeGen/AIE/aie2/extract.ll b/llvm/test/CodeGen/AIE/aie2/extract.ll index 54630f271bca..f8a6c292fc3b 100644 --- a/llvm/test/CodeGen/AIE/aie2/extract.ll +++ b/llvm/test/CodeGen/AIE/aie2/extract.ll @@ -72,8 +72,7 @@ define dso_local noundef <32 x i8> @_Z30test_extract_v64uint4_256_1024Dv128_DU8_ ; CHECK-LABEL: _Z30test_extract_v64uint4_256_1024Dv128_DU8_i: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: paddb [sp], #160; nopx -; CHECK-NEXT: mova r2, #4 +; CHECK-NEXT: nopa ; paddb [sp], #160; nopx ; CHECK-NEXT: mov r1, r0 ; CHECK-NEXT: st r16, [sp, #-160] // 4-byte Folded Spill ; CHECK-NEXT: vst wl4, [sp, #-128] // 32-byte Folded Spill @@ -82,7 +81,7 @@ define dso_local noundef <32 x i8> @_Z30test_extract_v64uint4_256_1024Dv128_DU8_ ; CHECK-NEXT: vst wl5, [sp, #-64] // 32-byte Folded Spill Delay Slot 4 ; CHECK-NEXT: st lr, [sp, #-156] // 4-byte Folded Spill Delay Slot 3 ; CHECK-NEXT: vst wh5, [sp, #-32] // 32-byte Folded Spill Delay Slot 2 -; CHECK-NEXT: mova r16, #2 // Delay Slot 1 +; CHECK-NEXT: mova r2, #4; movx r16, #2 // Delay Slot 1 ; CHECK-NEXT: nopb ; nopa ; nops ; eq r1, r0, r16; nopm ; nopv ; CHECK-NEXT: jnz r1, #.LBB2_5 ; 
CHECK-NEXT: nop // Delay Slot 5 @@ -434,8 +433,7 @@ define dso_local noundef <4 x i64> @_Z20test_extract_v8acc32Dv32_u7__acc32i(<16 ; CHECK-LABEL: _Z20test_extract_v8acc32Dv32_u7__acc32i: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: paddb [sp], #160; nopx -; CHECK-NEXT: mova r2, #4 +; CHECK-NEXT: nopa ; paddb [sp], #160; nopx ; CHECK-NEXT: mov r1, r0 ; CHECK-NEXT: st r16, [sp, #-160] // 4-byte Folded Spill ; CHECK-NEXT: vst amll1, [sp, #-128] // 32-byte Folded Spill @@ -444,7 +442,7 @@ define dso_local noundef <4 x i64> @_Z20test_extract_v8acc32Dv32_u7__acc32i(<16 ; CHECK-NEXT: vst amhl1, [sp, #-64] // 32-byte Folded Spill Delay Slot 4 ; CHECK-NEXT: st lr, [sp, #-156] // 4-byte Folded Spill Delay Slot 3 ; CHECK-NEXT: vst amhh1, [sp, #-32] // 32-byte Folded Spill Delay Slot 2 -; CHECK-NEXT: mova r16, #2 // Delay Slot 1 +; CHECK-NEXT: mova r2, #4; movx r16, #2 // Delay Slot 1 ; CHECK-NEXT: nopb ; nopa ; nops ; eq r1, r0, r16; nopm ; nopv ; CHECK-NEXT: jnz r1, #.LBB13_5 ; CHECK-NEXT: nop // Delay Slot 5 @@ -665,8 +663,7 @@ define dso_local noundef <4 x i64> @_Z20test_extract_v4acc64Dv16_u7__acc64i(<16 ; CHECK-LABEL: _Z20test_extract_v4acc64Dv16_u7__acc64i: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: paddb [sp], #160; nopx -; CHECK-NEXT: mova r2, #4 +; CHECK-NEXT: nopa ; paddb [sp], #160; nopx ; CHECK-NEXT: mov r1, r0 ; CHECK-NEXT: st r16, [sp, #-160] // 4-byte Folded Spill ; CHECK-NEXT: vst amll1, [sp, #-128] // 32-byte Folded Spill @@ -675,7 +672,7 @@ define dso_local noundef <4 x i64> @_Z20test_extract_v4acc64Dv16_u7__acc64i(<16 ; CHECK-NEXT: vst amhl1, [sp, #-64] // 32-byte Folded Spill Delay Slot 4 ; CHECK-NEXT: st lr, [sp, #-156] // 4-byte Folded Spill Delay Slot 3 ; CHECK-NEXT: vst amhh1, [sp, #-32] // 32-byte Folded Spill Delay Slot 2 -; CHECK-NEXT: mova r16, #2 // Delay Slot 1 +; CHECK-NEXT: mova r2, #4; movx r16, #2 // Delay Slot 1 ; CHECK-NEXT: nopb ; nopa ; nops ; eq r1, r0, r16; nopm ; nopv ; CHECK-NEXT: jnz 
r1, #.LBB20_5 ; CHECK-NEXT: nop // Delay Slot 5 diff --git a/llvm/test/CodeGen/AIE/aie2/hardware-loops/nested.ll b/llvm/test/CodeGen/AIE/aie2/hardware-loops/nested.ll index 66977e1274ca..42b7a2cab608 100644 --- a/llvm/test/CodeGen/AIE/aie2/hardware-loops/nested.ll +++ b/llvm/test/CodeGen/AIE/aie2/hardware-loops/nested.ll @@ -20,8 +20,8 @@ define void @nested(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef ; CHECK-NEXT: // %bb.0: // %for.cond3.preheader.lr.ph ; CHECK-NEXT: nopa ; nopb ; j #.LBB0_3 ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: mova r3, #0 // Delay Slot 4 -; CHECK-NEXT: mova r4, #2 // Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: mova r3, #0; movx r4, #2 // Delay Slot 3 ; CHECK-NEXT: movxm p2, #.LBB0_1 // Delay Slot 2 ; CHECK-NEXT: lda r2, [p0, #0] // Delay Slot 1 ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/AIE/aie2/hardware-loops/sibling.ll b/llvm/test/CodeGen/AIE/aie2/hardware-loops/sibling.ll index 5e1501961f7a..74396c4e2ddf 100644 --- a/llvm/test/CodeGen/AIE/aie2/hardware-loops/sibling.ll +++ b/llvm/test/CodeGen/AIE/aie2/hardware-loops/sibling.ll @@ -12,11 +12,9 @@ define void @sibling(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef ; CHECK-LABEL: sibling: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %for.body.lr.ph -; CHECK-NEXT: mova r2, #0; nopxm -; CHECK-NEXT: add.nc r0, r0, #-1 -; CHECK-NEXT: mova r4, #2 +; CHECK-NEXT: nopa ; nopb ; nopx ; add.nc r0, r0, #-1 ; CHECK-NEXT: movxm p2, #.LBB0_1 -; CHECK-NEXT: mova r5, #0 +; CHECK-NEXT: mova r2, #0; movx r5, #0; mov r4, #2 ; CHECK-NEXT: lda r3, [p0, #0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body diff --git a/llvm/test/CodeGen/AIE/aie2/hardware-loops/simple.ll b/llvm/test/CodeGen/AIE/aie2/hardware-loops/simple.ll index 7168d4023e37..fccb281c9c64 100644 --- a/llvm/test/CodeGen/AIE/aie2/hardware-loops/simple.ll +++ b/llvm/test/CodeGen/AIE/aie2/hardware-loops/simple.ll @@ -12,9 +12,8 @@ define void @simple(ptr nocapture %out, 
ptr nocapture readonly %in, i32 noundef ; CHECK-LABEL: simple: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %for.body.lr.ph -; CHECK-NEXT: mova r2, #0; nopb ; nopxm ; nops -; CHECK-NEXT: add.nc r0, r0, #-1 -; CHECK-NEXT: mova r3, #2 +; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; add.nc r0, r0, #-1; nopv +; CHECK-NEXT: mova r2, #0; movx r3, #2 ; CHECK-NEXT: movxm p2, #.LBB0_1 ; CHECK-NEXT: lda r1, [p0, #0] ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/AIE/aie2/hardware-loops/unknown-tc.ll b/llvm/test/CodeGen/AIE/aie2/hardware-loops/unknown-tc.ll index 22a430126228..e4cebabe0c0a 100644 --- a/llvm/test/CodeGen/AIE/aie2/hardware-loops/unknown-tc.ll +++ b/llvm/test/CodeGen/AIE/aie2/hardware-loops/unknown-tc.ll @@ -11,8 +11,7 @@ define void @cbz_exit(ptr %in, ptr %res) { ; CHECK-LABEL: cbz_exit: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova r0, #-1; nopb ; nopxm -; CHECK-NEXT: mova r1, #2 +; CHECK-NEXT: nopb ; mova r0, #-1; nops ; movx r1, #2; nopm ; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 @@ -59,8 +58,7 @@ define void @cbnz_exit(ptr %in, ptr %res) { ; CHECK-LABEL: cbnz_exit: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova r0, #-1; nopb ; nopxm -; CHECK-NEXT: mova r1, #2 +; CHECK-NEXT: nopb ; mova r0, #-1; nops ; movx r1, #2; nopm ; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB1_1: // %loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AIE/aie2/intrinsics-128bit.ll b/llvm/test/CodeGen/AIE/aie2/intrinsics-128bit.ll index c112bc527753..7caf2404b168 100644 --- a/llvm/test/CodeGen/AIE/aie2/intrinsics-128bit.ll +++ b/llvm/test/CodeGen/AIE/aie2/intrinsics-128bit.ll @@ -32,9 +32,9 @@ define <32 x i16> @test_set_v32int16(i32 noundef %idx, ptr nocapture readonly % ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: vlda.128 wl0, [p0] ; CHECK-NEXT: nop -; CHECK-NEXT: mova r1, #4 +; CHECK-NEXT: nop ; CHECK-NEXT: 
ret lr -; CHECK-NEXT: mova r2, #64 // Delay Slot 5 +; CHECK-NEXT: mova r1, #4; movx r2, #64 // Delay Slot 5 ; CHECK-NEXT: lshl r0, r0, r1 // Delay Slot 4 ; CHECK-NEXT: sub r0, r2, r0 // Delay Slot 3 ; CHECK-NEXT: vshift x0, x0, x0, r0 // Delay Slot 2 @@ -55,11 +55,8 @@ define <64 x i8> @insert_128_in_512(<64 x i8> noundef %v, i32 noundef %idx, <16 ; CHECK-LABEL: insert_128_in_512: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov r5, r16; nopv -; CHECK-NEXT: mova r1, #4 -; CHECK-NEXT: mova r2, #64 -; CHECK-NEXT: mova r3, #2 -; CHECK-NEXT: mova r4, #15 +; CHECK-NEXT: nopb ; nopa ; nops ; or r5, r16, r16; mov r1, #4; nopv +; CHECK-NEXT: nopb ; mova r2, #64; nops ; movx r4, #15; mov r3, #2; nopv ; CHECK-NEXT: lshl r1, r0, r1 ; CHECK-NEXT: lshl r0, r0, r3 ; CHECK-NEXT: ret lr @@ -87,11 +84,8 @@ define dso_local noundef <32 x i8> @insert_128_in_256(<32 x i8> noundef %v, i32 ; CHECK-LABEL: insert_128_in_256: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov r5, r16; nopv -; CHECK-NEXT: mova r1, #4 -; CHECK-NEXT: mova r2, #64 -; CHECK-NEXT: mova r3, #2 -; CHECK-NEXT: mova r4, #15 +; CHECK-NEXT: nopb ; nopa ; nops ; or r5, r16, r16; mov r1, #4; nopv +; CHECK-NEXT: nopb ; mova r2, #64; nops ; movx r4, #15; mov r3, #2; nopv ; CHECK-NEXT: lshl r1, r0, r1 ; CHECK-NEXT: lshl r0, r0, r3 ; CHECK-NEXT: ret lr @@ -121,25 +115,19 @@ define <64 x i8> @test_concat_4_v32uint4(<16 x i8> noundef %v0, <16 x i8> nounde ; CHECK-LABEL: test_concat_4_v32uint4: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mov r3, r16 -; CHECK-NEXT: mov r4, r17 -; CHECK-NEXT: mov r5, r18 -; CHECK-NEXT: mova r0, #48 -; CHECK-NEXT: mova r1, #32 +; CHECK-NEXT: nopa ; or r3, r16, r16; mov r0, #48 +; CHECK-NEXT: or r5, r18, r18; mov r4, r17 ; CHECK-NEXT: movxm r16, #3840 -; CHECK-NEXT: mova r2, #16 ; CHECK-NEXT: movxm r17, #61440 -; CHECK-NEXT: mova r18, #15 +; CHECK-NEXT: mova r1, #32; movx 
r18, #15; mov r2, #16 ; CHECK-NEXT: vshift x0, x0, x4, r0 ; CHECK-NEXT: vshift x4, x0, x6, r1 -; CHECK-NEXT: vsel.32 x0, x0, x4, r16 -; CHECK-NEXT: mov r16, r3 ; CHECK-NEXT: ret lr -; CHECK-NEXT: vshift x4, x0, x8, r2 // Delay Slot 5 -; CHECK-NEXT: vsel.32 x0, x0, x4, r17 // Delay Slot 4 -; CHECK-NEXT: mov r17, r4 // Delay Slot 3 +; CHECK-NEXT: or r16, r3, r3; vsel.32 x0, x0, x4, r16 // Delay Slot 5 +; CHECK-NEXT: vshift x4, x0, x8, r2 // Delay Slot 4 +; CHECK-NEXT: vsel.32 x0, x0, x4, r17 // Delay Slot 3 ; CHECK-NEXT: vsel.32 x0, x0, x2, r18 // Delay Slot 2 -; CHECK-NEXT: mov r18, r5 // Delay Slot 1 +; CHECK-NEXT: or r18, r5, r5; mov r17, r4 // Delay Slot 1 entry: %0 = bitcast <16 x i8> %v1 to <4 x i32> %1 = tail call <16 x i32> @llvm.aie2.set.I512.I128(<4 x i32> %0) @@ -164,10 +152,9 @@ define <32 x i8> @test_concat_2_v32uint4(<16 x i8> noundef %v0, <16 x i8> nounde ; CHECK-LABEL: test_concat_2_v32uint4: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopx ; mov r1, r16 -; CHECK-NEXT: ret lr -; CHECK-NEXT: mova r0, #48 // Delay Slot 5 -; CHECK-NEXT: mova r16, #15 // Delay Slot 4 +; CHECK-NEXT: nopa ; ret lr ; nopm +; CHECK-NEXT: mov r1, r16 // Delay Slot 5 +; CHECK-NEXT: mova r0, #48; movx r16, #15 // Delay Slot 4 ; CHECK-NEXT: vshift x0, x0, x4, r0 // Delay Slot 3 ; CHECK-NEXT: vsel.32 x0, x0, x2, r16 // Delay Slot 2 ; CHECK-NEXT: mov r16, r1 // Delay Slot 1 diff --git a/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll b/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll index b51e151ed882..27ae9e4d660c 100644 --- a/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll +++ b/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll @@ -179,9 +179,9 @@ define i32 @test_extract_elem(<8 x i32> noundef %a, i32 noundef %idx) { ; CHECK-LABEL: test_extract_elem: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops -; CHECK-NEXT: mov r2, r16 // Delay Slot 5 -; CHECK-NEXT: mov r16, r1 // Delay Slot 4 +; 
CHECK-NEXT: nopa ; nopb ; ret lr ; nopm +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: or r16, r1, r1; mov r2, r16 // Delay Slot 4 ; CHECK-NEXT: vextract.s32 r0, x0, r16 // Delay Slot 3 ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: mov r16, r2 // Delay Slot 1 diff --git a/llvm/test/CodeGen/AIE/aie2/ld_128.ll b/llvm/test/CodeGen/AIE/aie2/ld_128.ll index 1f0ed5b2f6bf..f09a3eba1284 100644 --- a/llvm/test/CodeGen/AIE/aie2/ld_128.ll +++ b/llvm/test/CodeGen/AIE/aie2/ld_128.ll @@ -73,9 +73,8 @@ define dso_local noundef <8 x i32> @test4() { ; CHECK-LABEL: test4: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: paddb [sp], #32; nopa ; nops ; nopxm ; nopv -; CHECK-NEXT: mova dj0, #512 -; CHECK-NEXT: mov p0, sp +; CHECK-NEXT: nopb ; padda [sp], #32; nops ; movxm dj0, #512; nopv +; CHECK-NEXT: nopx ; mov p0, sp ; CHECK-NEXT: paddb [p0], #-32 ; CHECK-NEXT: vldb.128 wl0, [p0, dj0] ; CHECK-NEXT: ret lr diff --git a/llvm/test/CodeGen/AIE/aie2/memcalls.ll b/llvm/test/CodeGen/AIE/aie2/memcalls.ll index c1021232e7da..3bdc2255106d 100644 --- a/llvm/test/CodeGen/AIE/aie2/memcalls.ll +++ b/llvm/test/CodeGen/AIE/aie2/memcalls.ll @@ -68,10 +68,10 @@ define void @callmemset(ptr %p) { ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopa ; nopb ; jl #memset -; CHECK-NEXT: mova r0, #42 // Delay Slot 5 +; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: paddb [sp], #32 // Delay Slot 4 ; CHECK-NEXT: st lr, [sp, #-32] // 4-byte Folded Spill Delay Slot 3 -; CHECK-NEXT: mova r1, #40 // Delay Slot 2 +; CHECK-NEXT: mova r0, #42; movx r1, #40 // Delay Slot 2 ; CHECK-NEXT: mov p1, p0 // Delay Slot 1 ; CHECK-NEXT: lda lr, [sp, #-32] // 4-byte Folded Reload ; CHECK-NEXT: nop diff --git a/llvm/test/CodeGen/AIE/aie2/odd-stackoffset.ll b/llvm/test/CodeGen/AIE/aie2/odd-stackoffset.ll index 4b72acceaabf..d088f9a4839c 100644 --- a/llvm/test/CodeGen/AIE/aie2/odd-stackoffset.ll +++ b/llvm/test/CodeGen/AIE/aie2/odd-stackoffset.ll @@ -14,14 +14,13 @@ define void @f() { ; 
CHECK-LABEL: f: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: paddb [sp], #32; nopa ; nops ; nopxm ; nopv -; CHECK-NEXT: mova m0, #-27; nopx +; CHECK-NEXT: paddb [sp], #32 ; CHECK-NEXT: jl #f0 -; CHECK-NEXT: mov p0, sp // Delay Slot 5 -; CHECK-NEXT: mov p1, sp // Delay Slot 4 +; CHECK-NEXT: mova m0, #-27 // Delay Slot 5 +; CHECK-NEXT: mov p0, sp // Delay Slot 4 ; CHECK-NEXT: st lr, [sp, #-32] // 4-byte Folded Spill Delay Slot 3 -; CHECK-NEXT: paddb [p0], #-28 // Delay Slot 2 -; CHECK-NEXT: paddb [p1], m0 // Delay Slot 1 +; CHECK-NEXT: mov p1, sp // Delay Slot 2 +; CHECK-NEXT: paddb [p0], #-28; padds [p1], m0 // Delay Slot 1 ; CHECK-NEXT: lda lr, [sp, #-32] // 4-byte Folded Reload ; CHECK-NEXT: nop ; CHECK-NEXT: nop diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/instruction_mutation/rescheduling.mir b/llvm/test/CodeGen/AIE/aie2/schedule/instruction_mutation/rescheduling.mir new file mode 100644 index 000000000000..bb10631c6a3b --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/instruction_mutation/rescheduling.mir @@ -0,0 +1,141 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates +# RUN: llc -march=aie2 -run-pass=postmisched --aie-instruction-mutation=true --verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ON +# RUN: llc -march=aie2 -run-pass=postmisched --aie-instruction-mutation=false --verify-machineinstrs %s -o - | FileCheck %s --check-prefix=OFF + +--- +name: check_rescheduling_1 +alignment: 16 +body: | + bb.0.entry: + ; ON-LABEL: name: check_rescheduling_1 + ; ON: BUNDLE implicit-def $r1, implicit-def $p1, implicit killed $p1 { + ; ON-NEXT: $r1 = LDA_TM killed $p1 :: (load (s32) from custom "TileMemory", addrspace 15) + ; ON-NEXT: $p1 = MOV_mv_cg 1 + ; ON-NEXT: } + ; ON-NEXT: $r2 = LDA_TM killed $p1 :: (load (s32) from custom "TileMemory", addrspace 15) + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; + ; OFF-LABEL: name: check_rescheduling_1 + ; OFF: $r1 = LDA_TM killed $p1 :: (load (s32) from custom "TileMemory", addrspace 15) + ; OFF-NEXT: $p1 = MOVA_lda_cg 1 + ; OFF-NEXT: $r2 = LDA_TM killed $p1 :: (load (s32) from custom "TileMemory", addrspace 15) + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + $r1 = LDA_TM $p1 :: (load (s32) from custom "TileMemory") + $p1 = MOV_PD_imm10_pseudo 1 + $r2 = LDA_TM $p1 :: (load (s32) from custom "TileMemory") +... 
+ + +--- +name: check_rescheduling_2 +alignment: 16 +body: | + bb.0.entry: + ; ON-LABEL: name: check_rescheduling_2 + ; ON: BUNDLE implicit-def $wh1, implicit-def $p1, implicit killed $p1 { + ; ON-NEXT: $wh1 = VLDA_dmw_lda_w_ag_idx_imm killed $p1, 0 + ; ON-NEXT: $p1 = MOV_mv_cg 1 + ; ON-NEXT: } + ; ON-NEXT: $r2 = LDA_TM killed $p1 :: (load (s32) from custom "TileMemory", addrspace 15) + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; + ; OFF-LABEL: name: check_rescheduling_2 + ; OFF: $wh1 = VLDA_dmw_lda_w_ag_idx_imm killed $p1, 0 + ; OFF-NEXT: $p1 = MOVA_lda_cg 1 + ; OFF-NEXT: $r2 = LDA_TM killed $p1 :: (load (s32) from custom "TileMemory", addrspace 15) + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + $wh1 = VLDA_dmw_lda_w_ag_idx_imm $p1, 0 + $p1 = MOV_PD_imm10_pseudo 1 + $r2 = LDA_TM $p1 :: (load (s32) from custom "TileMemory") +... + +--- +name: check_rescheduling_3 +alignment: 16 +body: | + bb.0.entry: + ; ON-LABEL: name: check_rescheduling_3 + ; ON: BUNDLE implicit-def $wh1, implicit-def $x7, implicit-def $wl7, implicit-def $wh7, implicit killed $p1, implicit killed $wl7 { + ; ON-NEXT: $wh1 = VLDA_dmw_lda_w_ag_idx_imm killed $p1, 0 + ; ON-NEXT: $x7 = VUNPACK_S8_S4 killed $wl7 + ; ON-NEXT: } + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; ON-NEXT: NOP + ; + ; OFF-LABEL: name: check_rescheduling_3 + ; OFF: $x7 = VUNPACK_S8_S4 killed $wl7 + ; OFF-NEXT: $wh1 = VLDB_dmw_ldb_ag_idx_imm killed $p1, 0 + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + ; OFF-NEXT: NOP + $x7 = VUNPACK_S8_S4 $wl7 + $wh1 = VLD_idx_imm_3x32_pseudo $p1, 0 +... 
+ +--- +name: check_rescheduling_4 +alignment: 16 +body: | + bb.0.entry: + ; ON-LABEL: name: check_rescheduling_4 + ; ON: BUNDLE implicit-def $p0, implicit-def $r0 { + ; ON-NEXT: $p0 = MOVA_lda_cg 10 + ; ON-NEXT: $r0 = MOVX_alu_cg 10 + ; ON-NEXT: } + ; + ; OFF-LABEL: name: check_rescheduling_4 + ; OFF: $p0 = MOVA_lda_cg 10 + ; OFF-NEXT: $r0 = MOVA_lda_cg 10 + $p0 = MOVA_lda_cg 10 + $r0 = MOV_RLC_imm10_pseudo 10 +... + +--- +name: check_rescheduling_multi_slot_mov_or_mov_reverse +alignment: 16 +body: | + bb.0.entry: + ; ON-LABEL: name: check_rescheduling_multi_slot_mov_or_mov_reverse + ; ON: BUNDLE implicit-def $r2, implicit-def $r7, implicit killed $r1, implicit killed $r6 { + ; ON-NEXT: $r2 = MOV_OR killed $r1 + ; ON-NEXT: $r7 = MOV_mv_scl killed $r6 + ; ON-NEXT: } + ; + ; OFF-LABEL: name: check_rescheduling_multi_slot_mov_or_mov_reverse + ; OFF: $r7 = MOV_mv_scl killed $r6 + ; OFF-NEXT: $r2 = MOV_mv_scl killed $r1 + $r7 = MOV_mv_scl $r6 + $r2 = MOV_SCL_pseudo $r1 +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/negative_latencies/load_accumulate.mir b/llvm/test/CodeGen/AIE/aie2/schedule/negative_latencies/load_accumulate.mir index ad76b8843308..2c4bc6617ed9 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/negative_latencies/load_accumulate.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/negative_latencies/load_accumulate.mir @@ -24,20 +24,26 @@ body: | ; CHECK: liveins: $p0, $p1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $dj0 = MOVA_lda_cg 14 - ; CHECK-NEXT: $r3 = LDA_S16_ag_idx $p0, killed $dj0 - ; CHECK-NEXT: $dj0 = MOVA_lda_cg 16 + ; CHECK-NEXT: BUNDLE implicit-def $r3, implicit-def $dj0, implicit $p0, implicit killed $dj0 { + ; CHECK-NEXT: $r3 = LDA_S16_ag_idx $p0, killed $dj0 + ; CHECK-NEXT: $dj0 = MOV_mv_cg 16 + ; CHECK-NEXT: } ; CHECK-NEXT: $r4 = LDA_S16_ag_idx $p0, killed $dj0 ; CHECK-NEXT: $dj0 = MOVA_lda_cg 18 - ; CHECK-NEXT: $r4 = LDA_S16_ag_idx $p0, killed $dj0 - ; CHECK-NEXT: $dj0 = MOVA_lda_cg 20 - ; CHECK-NEXT: $r4 = LDA_S16_ag_idx $p0, killed $dj0 - ; 
CHECK-NEXT: $dj0 = MOVA_lda_cg 22 + ; CHECK-NEXT: BUNDLE implicit-def $r4, implicit-def $dj0, implicit $p0, implicit killed $dj0 { + ; CHECK-NEXT: $r4 = LDA_S16_ag_idx $p0, killed $dj0 + ; CHECK-NEXT: $dj0 = MOV_mv_cg 20 + ; CHECK-NEXT: } + ; CHECK-NEXT: BUNDLE implicit-def $r4, implicit-def $dj0, implicit $p0, implicit killed $dj0 { + ; CHECK-NEXT: $r4 = LDA_S16_ag_idx $p0, killed $dj0 + ; CHECK-NEXT: $dj0 = MOV_mv_cg 22 + ; CHECK-NEXT: } ; CHECK-NEXT: $r4 = LDA_S16_ag_idx killed $p0, killed $dj0 ; CHECK-NEXT: NOP - ; CHECK-NEXT: $r3 = ADD killed $r3, $r4, implicit-def $srcarry + ; CHECK-NEXT: NOP ; CHECK-NEXT: $r3 = ADD killed $r3, $r4, implicit-def $srcarry ; CHECK-NEXT: RET implicit $lr - ; CHECK-NEXT: NOP + ; CHECK-NEXT: $r3 = ADD killed $r3, $r4, implicit-def $srcarry ; CHECK-NEXT: $r3 = ADD killed $r3, $r4, implicit-def $srcarry ; CHECK-NEXT: $r3 = ADD killed $r3, killed $r4, implicit-def $srcarry ; CHECK-NEXT: ST_dms_sts_idx_imm killed $r3, killed $p1, 0 diff --git a/llvm/test/CodeGen/AIE/aie2/scl2vec-bfloat16.ll b/llvm/test/CodeGen/AIE/aie2/scl2vec-bfloat16.ll index 858b7423f4c1..d3a4c15737ca 100644 --- a/llvm/test/CodeGen/AIE/aie2/scl2vec-bfloat16.ll +++ b/llvm/test/CodeGen/AIE/aie2/scl2vec-bfloat16.ll @@ -79,10 +79,10 @@ define dso_local noundef <32 x bfloat> @_Z11test_insertDv32_u6__bf16iy(<32 x bfl ; CHECK-LABEL: _Z11test_insertDv32_u6__bf16iy: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops -; CHECK-NEXT: mov r24, r1 // Delay Slot 5 -; CHECK-NEXT: mov r29, r0 // Delay Slot 4 -; CHECK-NEXT: mov r25, r2 // Delay Slot 3 +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: mov r24, r1 // Delay Slot 4 +; CHECK-NEXT: or r25, r2, r2; mov r29, r0 // Delay Slot 3 ; CHECK-NEXT: vinsert.64 x0, x2, r29, r25:r24 // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -194,14 +194,12 @@ define dso_local %class.bfloat16 @_Z13test_ext_elemDv32_u6__bf16ii(<32 x bfloat> 
; CHECK-LABEL: _Z13test_ext_elemDv32_u6__bf16ii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; mov r3, r16 -; CHECK-NEXT: mov r16, r1 -; CHECK-NEXT: ret lr -; CHECK-NEXT: mov crVaddSign, r2 // Delay Slot 5 -; CHECK-NEXT: vextract.d16 r0, x0, r16 // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r3 // Delay Slot 1 +; CHECK-NEXT: nopa ; ret lr +; CHECK-NEXT: or r16, r1, r1; mov r3, r16 // Delay Slot 5 +; CHECK-NEXT: mov crVaddSign, r2 // Delay Slot 4 +; CHECK-NEXT: vextract.d16 r0, x0, r16 // Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: or r16, r3, r3; mov crVaddSign, #0 // Delay Slot 1 entry: %0 = bitcast <32 x bfloat> %v to <32 x i16> %1 = tail call i32 @llvm.aie2.vextract.elem16.I512(<32 x i16> %0, i32 %idx, i32 %sign) @@ -215,14 +213,12 @@ define dso_local noundef <2 x bfloat> @_Z23test_extract_v2bfloat16Dv32_u6__bf16i ; CHECK-LABEL: _Z23test_extract_v2bfloat16Dv32_u6__bf16ii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; mov r3, r16 -; CHECK-NEXT: mov r16, r1 -; CHECK-NEXT: ret lr -; CHECK-NEXT: mov crVaddSign, r2 // Delay Slot 5 -; CHECK-NEXT: vextract.d32 r0, x0, r16 // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r3 // Delay Slot 1 +; CHECK-NEXT: nopa ; ret lr +; CHECK-NEXT: or r16, r1, r1; mov r3, r16 // Delay Slot 5 +; CHECK-NEXT: mov crVaddSign, r2 // Delay Slot 4 +; CHECK-NEXT: vextract.d32 r0, x0, r16 // Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: or r16, r3, r3; mov crVaddSign, #0 // Delay Slot 1 entry: %0 = bitcast <32 x bfloat> %v to <16 x i32> %1 = tail call i32 @llvm.aie2.vextract.elem32.I512(<16 x i32> %0, i32 %idx, i32 %sign) @@ -234,16 +230,13 @@ define dso_local noundef i64 @_Z12test_ext_u64Dv32_u6__bf16ii(<32 x bfloat> noun ; CHECK-LABEL: _Z12test_ext_u64Dv32_u6__bf16ii: ; CHECK: .p2align 4 ; 
CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; nopx ; mov r4, r16; nops -; CHECK-NEXT: mov r16, r2 -; CHECK-NEXT: mov crVaddSign, r3 -; CHECK-NEXT: vextract.d64 r25:r24, x0, r16 -; CHECK-NEXT: ret lr -; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 4 -; CHECK-NEXT: mov r16, r4 // Delay Slot 3 -; CHECK-NEXT: mov r0, r24 // Delay Slot 2 -; CHECK-NEXT: mov r1, r25 // Delay Slot 1 +; CHECK-NEXT: nopb ; nopa ; nops ; or r16, r2, r2; mov r4, r16; nopv +; CHECK-NEXT: nopa ; ret lr +; CHECK-NEXT: mov crVaddSign, r3 // Delay Slot 5 +; CHECK-NEXT: vextract.d64 r25:r24, x0, r16 // Delay Slot 4 +; CHECK-NEXT: nop // Delay Slot 3 +; CHECK-NEXT: or r16, r4, r4; mov crVaddSign, #0 // Delay Slot 2 +; CHECK-NEXT: or r1, r25, r25; mov r0, r24 // Delay Slot 1 entry: %0 = bitcast <32 x bfloat> %v to <16 x i32> %1 = tail call <2 x i32> @llvm.aie2.vextract.elem64.I512(<16 x i32> %0, i32 %idx, i32 %sign) diff --git a/llvm/test/CodeGen/AIE/aie2/scl2vec-float.ll b/llvm/test/CodeGen/AIE/aie2/scl2vec-float.ll index 71b29841b7a4..0ab47cd30683 100644 --- a/llvm/test/CodeGen/AIE/aie2/scl2vec-float.ll +++ b/llvm/test/CodeGen/AIE/aie2/scl2vec-float.ll @@ -28,14 +28,12 @@ define float @test_extract_elem_floatv16(<16 x float> %v, i32 %idx, i32 %sign) { ; CHECK-LABEL: test_extract_elem_floatv16: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; mov r3, r16 -; CHECK-NEXT: mov r16, r1 -; CHECK-NEXT: ret lr -; CHECK-NEXT: mov crVaddSign, r2 // Delay Slot 5 -; CHECK-NEXT: vextract.d32 r0, x0, r16 // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r3 // Delay Slot 1 +; CHECK-NEXT: nopa ; ret lr +; CHECK-NEXT: or r16, r1, r1; mov r3, r16 // Delay Slot 5 +; CHECK-NEXT: mov crVaddSign, r2 // Delay Slot 4 +; CHECK-NEXT: vextract.d32 r0, x0, r16 // Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: or r16, r3, r3; mov crVaddSign, #0 // Delay Slot 1 
entry: %0 = bitcast <16 x float> %v to <16 x i32> %1 = tail call i32 @llvm.aie2.vextract.elem32.I512(<16 x i32> %0, i32 %idx, i32 %sign) diff --git a/llvm/test/CodeGen/AIE/aie2/set.ll b/llvm/test/CodeGen/AIE/aie2/set.ll index 6cc8405f48f6..1f8f5dd0b5ee 100644 --- a/llvm/test/CodeGen/AIE/aie2/set.ll +++ b/llvm/test/CodeGen/AIE/aie2/set.ll @@ -72,15 +72,14 @@ define dso_local noundef <128 x i8> @_Z27test_set_v256uint4_1024_256iDv32_DU8_(i ; CHECK-LABEL: _Z27test_set_v256uint4_1024_256iDv32_DU8_: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; paddb [sp], #160; nopxm ; nops -; CHECK-NEXT: mova r2, #4 +; CHECK-NEXT: paddb [sp], #160; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: mov r1, r0 ; CHECK-NEXT: jl #__modsi3 ; CHECK-NEXT: vmov wh5, wl0 // Delay Slot 5 ; CHECK-NEXT: st r16, [sp, #-160] // 4-byte Folded Spill Delay Slot 4 ; CHECK-NEXT: st lr, [sp, #-156] // 4-byte Folded Spill Delay Slot 3 ; CHECK-NEXT: vst wh5, [sp, #-32] // 32-byte Folded Spill Delay Slot 2 -; CHECK-NEXT: mova r16, #2 // Delay Slot 1 +; CHECK-NEXT: mova r2, #4; movx r16, #2 // Delay Slot 1 ; CHECK-NEXT: nopb ; nopa ; nops ; eq r1, r0, r16; nopm ; nopv ; CHECK-NEXT: jnz r1, #.LBB2_5 ; CHECK-NEXT: nop // Delay Slot 5 @@ -378,15 +377,14 @@ define dso_local noundef <16 x i64> @_Z17test_set_v32acc32iDv8_u7__acc32(i32 nou ; CHECK-LABEL: _Z17test_set_v32acc32iDv8_u7__acc32: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; paddb [sp], #160; nopxm ; nops -; CHECK-NEXT: mova r2, #4 +; CHECK-NEXT: paddb [sp], #160; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: mov r1, r0 ; CHECK-NEXT: jl #__modsi3 ; CHECK-NEXT: vmov amhl0, amll1 // Delay Slot 5 ; CHECK-NEXT: st r16, [sp, #-160] // 4-byte Folded Spill Delay Slot 4 ; CHECK-NEXT: st lr, [sp, #-156] // 4-byte Folded Spill Delay Slot 3 ; CHECK-NEXT: vst amhl0, [sp, #-64] // 32-byte Folded Spill Delay Slot 2 -; CHECK-NEXT: mova r16, #2 // Delay Slot 1 +; CHECK-NEXT: mova r2, #4; movx r16, #2 // Delay Slot 1 ; 
CHECK-NEXT: nopb ; nopa ; nops ; eq r1, r0, r16; nopm ; nopv ; CHECK-NEXT: jnz r1, #.LBB10_5 ; CHECK-NEXT: nop // Delay Slot 5 @@ -561,15 +559,14 @@ define dso_local noundef <16 x i64> @_Z17test_set_v16acc64iDv4_u7__acc64(i32 nou ; CHECK-LABEL: _Z17test_set_v16acc64iDv4_u7__acc64: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; paddb [sp], #160; nopxm ; nops -; CHECK-NEXT: mova r2, #4 +; CHECK-NEXT: paddb [sp], #160; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: mov r1, r0 ; CHECK-NEXT: jl #__modsi3 ; CHECK-NEXT: vmov amhl0, amll1 // Delay Slot 5 ; CHECK-NEXT: st r16, [sp, #-160] // 4-byte Folded Spill Delay Slot 4 ; CHECK-NEXT: st lr, [sp, #-156] // 4-byte Folded Spill Delay Slot 3 ; CHECK-NEXT: vst amhl0, [sp, #-64] // 32-byte Folded Spill Delay Slot 2 -; CHECK-NEXT: mova r16, #2 // Delay Slot 1 +; CHECK-NEXT: mova r2, #4; movx r16, #2 // Delay Slot 1 ; CHECK-NEXT: nopb ; nopa ; nops ; eq r1, r0, r16; nopm ; nopv ; CHECK-NEXT: jnz r1, #.LBB14_5 ; CHECK-NEXT: nop // Delay Slot 5 diff --git a/llvm/test/CodeGen/AIE/aie2/spill_to_reg_ext.ll b/llvm/test/CodeGen/AIE/aie2/spill_to_reg_ext.ll index 83521ee69a1c..91fcf75c0f51 100644 --- a/llvm/test/CodeGen/AIE/aie2/spill_to_reg_ext.ll +++ b/llvm/test/CodeGen/AIE/aie2/spill_to_reg_ext.ll @@ -26,15 +26,13 @@ define dso_local noundef <16 x i32> @_Z17test_max_v16int32Dv16_iS_b(<16 x i32> n ; CHECK-LABEL: _Z17test_max_v16int32Dv16_iS_b: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mov r1, r16 -; CHECK-NEXT: mov crVaddSign, r0 -; CHECK-NEXT: vmax_lt.d32 x0, r16, x2, x4 +; CHECK-NEXT: nopa ; mov r1, r16 ; CHECK-NEXT: ret lr -; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 4 +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 +; CHECK-NEXT: vmax_lt.d32 x0, r16, x2, x4 // Delay Slot 4 ; CHECK-NEXT: vmax_lt.s32 x0, r16, x0, x4 // Delay Slot 3 ; CHECK-NEXT: nop // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: or r16, r1, r1; mov 
crVaddSign, #0 // Delay Slot 1 entry: %conv.i.i = zext i1 %sgn to i32 %0 = tail call { <16 x i32>, i32 } @llvm.aie2.vmax.lt32(<16 x i32> %a, <16 x i32> %b, i32 %conv.i.i) diff --git a/llvm/test/CodeGen/AIE/aie2/vabs_gtz.ll b/llvm/test/CodeGen/AIE/aie2/vabs_gtz.ll index a06857d4bd65..fb49f70a664b 100644 --- a/llvm/test/CodeGen/AIE/aie2/vabs_gtz.ll +++ b/llvm/test/CodeGen/AIE/aie2/vabs_gtz.ll @@ -67,13 +67,12 @@ define dso_local noundef <32 x i16> @_Z22test_abs_gtz_v32uint16Dv32_tbRj(<32 x i ; CHECK-LABEL: _Z22test_abs_gtz_v32uint16Dv32_tbRj: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopx ; mov crVaddSign, r0 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vabs_gtz.d16 x0, r16, x2 // Delay Slot 5 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 4 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 +; CHECK-NEXT: nopa ; nopb ; ret lr +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 +; CHECK-NEXT: vabs_gtz.d16 x0, r16, x2 // Delay Slot 4 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 ; CHECK-NEXT: st r16, [p0, #0] // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: or r16, r1, r1; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call { <32 x i16>, i32 } @llvm.aie2.vabs.gtz16(<32 x i16> %a, i32 %conv.i) @@ -123,13 +122,12 @@ define dso_local noundef <16 x i32> @_Z21test_abs_gtz_v16int32Dv16_ibRj(<16 x i3 ; CHECK-LABEL: _Z21test_abs_gtz_v16int32Dv16_ibRj: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopx ; mov crVaddSign, r0 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vabs_gtz.d32 x0, r16, x2 // Delay Slot 5 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 4 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 +; CHECK-NEXT: nopa ; nopb ; ret lr +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 +; CHECK-NEXT: vabs_gtz.d32 x0, r16, x2 // Delay Slot 4 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 ; CHECK-NEXT: st r16, [p0, #0] // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; 
CHECK-NEXT: or r16, r1, r1; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call { <16 x i32>, i32 } @llvm.aie2.vabs.gtz32(<16 x i32> %a, i32 %conv.i) diff --git a/llvm/test/CodeGen/AIE/aie2/vaddmac.ll b/llvm/test/CodeGen/AIE/aie2/vaddmac.ll index ab5a70ae1a34..7078248d85d7 100644 --- a/llvm/test/CodeGen/AIE/aie2/vaddmac.ll +++ b/llvm/test/CodeGen/AIE/aie2/vaddmac.ll @@ -11,8 +11,7 @@ define dso_local noundef <16 x i64> @test_addmac_acc32(<64 x i8> noundef %a, i32 ; CHECK-LABEL: test_addmac_acc32: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova r2, #9; nopb ; nopxm -; CHECK-NEXT: mova r3, #8 +; CHECK-NEXT: nopb ; mova r2, #9; nops ; movx r3, #8; nopm ; nopv ; CHECK-NEXT: vmov cm0, cm1 ; CHECK-NEXT: lshl r0, r0, r2 ; CHECK-NEXT: lshl r1, r1, r3 @@ -36,8 +35,7 @@ define dso_local noundef <16 x i64> @test_addmsc_acc32(<64 x i8> noundef %a, i32 ; CHECK-LABEL: test_addmsc_acc32: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova r2, #9; nopb ; nopxm -; CHECK-NEXT: mova r3, #8 +; CHECK-NEXT: nopb ; mova r2, #9; nops ; movx r3, #8; nopm ; nopv ; CHECK-NEXT: vmov cm0, cm1 ; CHECK-NEXT: lshl r0, r0, r2 ; CHECK-NEXT: lshl r1, r1, r3 @@ -61,8 +59,7 @@ define dso_local noundef <16 x i64> @test_submac_acc32(<64 x i8> noundef %a, i32 ; CHECK-LABEL: test_submac_acc32: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova r2, #9; nopb ; nopxm -; CHECK-NEXT: mova r3, #8 +; CHECK-NEXT: nopb ; mova r2, #9; nops ; movx r3, #8; nopm ; nopv ; CHECK-NEXT: vmov cm0, cm1 ; CHECK-NEXT: lshl r0, r0, r2 ; CHECK-NEXT: lshl r1, r1, r3 @@ -86,8 +83,7 @@ define dso_local noundef <16 x i64> @test_submsc_acc32(<64 x i8> noundef %a, i32 ; CHECK-LABEL: test_submsc_acc32: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova r2, #9; nopb ; nopxm -; CHECK-NEXT: mova r3, #8 +; CHECK-NEXT: nopb ; mova r2, #9; nops ; movx r3, #8; nopm ; nopv ; CHECK-NEXT: vmov cm0, cm1 ; CHECK-NEXT: 
lshl r0, r0, r2 ; CHECK-NEXT: lshl r1, r1, r3 @@ -111,9 +107,7 @@ define dso_local noundef <16 x i64> @test_addmac_acc64(<32 x i16> noundef %a, i3 ; CHECK-LABEL: test_addmac_acc64: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova r2, #9 -; CHECK-NEXT: mova r3, #8 -; CHECK-NEXT: mova r4, #18 +; CHECK-NEXT: mova r2, #9; nopb ; movx r4, #18; mov r3, #8 ; CHECK-NEXT: vmov cm0, cm1 ; CHECK-NEXT: lshl r0, r0, r2 ; CHECK-NEXT: lshl r1, r1, r3 @@ -140,9 +134,7 @@ define dso_local noundef <16 x i64> @test_addmsc_acc64(<32 x i16> noundef %a, i3 ; CHECK-LABEL: test_addmsc_acc64: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova r2, #9 -; CHECK-NEXT: mova r3, #8 -; CHECK-NEXT: mova r4, #18 +; CHECK-NEXT: mova r2, #9; nopb ; movx r4, #18; mov r3, #8 ; CHECK-NEXT: vmov cm0, cm1 ; CHECK-NEXT: lshl r0, r0, r2 ; CHECK-NEXT: lshl r1, r1, r3 @@ -169,9 +161,7 @@ define dso_local noundef <16 x i64> @test_subadd_acc64(<32 x i16> noundef %a, i3 ; CHECK-LABEL: test_subadd_acc64: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova r2, #9 -; CHECK-NEXT: mova r3, #8 -; CHECK-NEXT: mova r4, #18 +; CHECK-NEXT: mova r2, #9; nopb ; movx r4, #18; mov r3, #8 ; CHECK-NEXT: vmov cm0, cm1 ; CHECK-NEXT: lshl r0, r0, r2 ; CHECK-NEXT: lshl r1, r1, r3 @@ -198,9 +188,7 @@ define dso_local noundef <16 x i64> @test_submsc_acc64(<32 x i16> noundef %a, i3 ; CHECK-LABEL: test_submsc_acc64: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova r2, #9 -; CHECK-NEXT: mova r3, #8 -; CHECK-NEXT: mova r4, #18 +; CHECK-NEXT: mova r2, #9; nopb ; movx r4, #18; mov r3, #8 ; CHECK-NEXT: vmov cm0, cm1 ; CHECK-NEXT: lshl r0, r0, r2 ; CHECK-NEXT: lshl r1, r1, r3 diff --git a/llvm/test/CodeGen/AIE/aie2/vcompare.ll b/llvm/test/CodeGen/AIE/aie2/vcompare.ll index e00d2690e2e2..82b04b828342 100644 --- a/llvm/test/CodeGen/AIE/aie2/vcompare.ll +++ b/llvm/test/CodeGen/AIE/aie2/vcompare.ll @@ -11,12 +11,12 @@ define i64 @test_ge_v64uint8(<64 x 
i8> %a, <64 x i8> %b) { ; CHECK-LABEL: test_ge_v64uint8: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: vge.d8 r25:r24, x0, x2 // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: mov r0, r24 // Delay Slot 2 -; CHECK-NEXT: mov r1, r25 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: vge.d8 r25:r24, x0, x2 // Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: or r1, r25, r25; mov r0, r24 // Delay Slot 1 entry: %0 = tail call <2 x i32> @llvm.aie2.vge8(<64 x i8> %a, <64 x i8> %b, i32 0) %1 = bitcast <2 x i32> %0 to i64 @@ -27,12 +27,12 @@ define i64 @test_ge_v64int8(<64 x i8> %a, <64 x i8> %b) { ; CHECK-LABEL: test_ge_v64int8: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: vge.s8 r25:r24, x0, x2 // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: mov r0, r24 // Delay Slot 2 -; CHECK-NEXT: mov r1, r25 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: vge.s8 r25:r24, x0, x2 // Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: or r1, r25, r25; mov r0, r24 // Delay Slot 1 entry: %0 = tail call <2 x i32> @llvm.aie2.vge8(<64 x i8> %a, <64 x i8> %b, i32 1) %1 = bitcast <2 x i32> %0 to i64 @@ -43,12 +43,12 @@ define i32 @test_ge_v32uint16(<32 x i16> %a, <32 x i16> %b) { ; CHECK-LABEL: test_ge_v32uint16: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: vge.d16 r16, x0, x2 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov r0, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; 
CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: vge.d16 r16, x0, x2 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov r0, r16 // Delay Slot 1 entry: %0 = tail call i32 @llvm.aie2.vge16(<32 x i16> %a, <32 x i16> %b, i32 0) ret i32 %0 @@ -58,12 +58,12 @@ define i32 @test_ge_v32int16(<32 x i16> %a, <32 x i16> %b) { ; CHECK-LABEL: test_ge_v32int16: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: vge.s16 r16, x0, x2 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov r0, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: vge.s16 r16, x0, x2 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov r0, r16 // Delay Slot 1 entry: %0 = tail call i32 @llvm.aie2.vge16(<32 x i16> %a, <32 x i16> %b, i32 1) ret i32 %0 @@ -73,12 +73,12 @@ define i32 @test_ge_v16uint32(<16 x i32> %a, <16 x i32> %b) { ; CHECK-LABEL: test_ge_v16uint32: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: vge.d32 r16, x0, x2 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov r0, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: vge.d32 r16, x0, x2 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov r0, r16 // Delay Slot 1 entry: %0 = tail call i32 @llvm.aie2.vge32(<16 x i32> %a, <16 x i32> %b, i32 0) ret i32 %0 @@ -88,12 +88,12 @@ define i32 @test_ge_v16int32(<16 x i32> %a, <16 x i32> %b) { ; CHECK-LABEL: test_ge_v16int32: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; 
nopb ; ret lr ; nopm ; nops +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: vge.s32 r16, x0, x2 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov r0, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: vge.s32 r16, x0, x2 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov r0, r16 // Delay Slot 1 entry: %0 = tail call i32 @llvm.aie2.vge32(<16 x i32> %a, <16 x i32> %b, i32 1) ret i32 %0 @@ -103,13 +103,12 @@ define i64 @test_ge_v64int8_sgn(<64 x i8> %a, <64 x i8> %b, i1 zeroext %sgn) { ; CHECK-LABEL: test_ge_v64int8_sgn: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopx ; mov crVaddSign, r2 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vge.d8 r25:r24, x0, x2 // Delay Slot 5 -; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 -; CHECK-NEXT: mov r0, r24 // Delay Slot 2 -; CHECK-NEXT: mov r1, r25 // Delay Slot 1 +; CHECK-NEXT: nopa ; ret lr ; nopm +; CHECK-NEXT: mov crVaddSign, r2 // Delay Slot 5 +; CHECK-NEXT: vge.d8 r25:r24, x0, x2 // Delay Slot 4 +; CHECK-NEXT: nop // Delay Slot 3 +; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 2 +; CHECK-NEXT: or r1, r25, r25; mov r0, r24 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call <2 x i32> @llvm.aie2.vge8(<64 x i8> %a, <64 x i8> %b, i32 %conv.i) @@ -121,13 +120,12 @@ define i32 @test_ge_v32uint16_sgn(<32 x i16> %a, <32 x i16> %b, i1 zeroext %sgn) ; CHECK-LABEL: test_ge_v32uint16_sgn: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopx ; mov crVaddSign, r1 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vge.d16 r16, x0, x2 // Delay Slot 5 -; CHECK-NEXT: or r2, r16, r16 // Delay Slot 4 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 -; CHECK-NEXT: mov r0, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, r2 // Delay Slot 1 +; CHECK-NEXT: nopa ; nopb ; ret lr +; 
CHECK-NEXT: mov crVaddSign, r1 // Delay Slot 5 +; CHECK-NEXT: vge.d16 r16, x0, x2 // Delay Slot 4 +; CHECK-NEXT: or r2, r16, r16 // Delay Slot 3 +; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 2 +; CHECK-NEXT: or r16, r2, r2; mov r0, r16 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call i32 @llvm.aie2.vge16(<32 x i16> %a, <32 x i16> %b, i32 %conv.i) @@ -138,13 +136,12 @@ define i32 @test_ge_v16int32_sgn(<16 x i32> %a, <16 x i32> %b, i1 zeroext %sgn) ; CHECK-LABEL: test_ge_v16int32_sgn: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopx ; mov crVaddSign, r1 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vge.d32 r16, x0, x2 // Delay Slot 5 -; CHECK-NEXT: or r2, r16, r16 // Delay Slot 4 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 -; CHECK-NEXT: mov r0, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, r2 // Delay Slot 1 +; CHECK-NEXT: nopa ; nopb ; ret lr +; CHECK-NEXT: mov crVaddSign, r1 // Delay Slot 5 +; CHECK-NEXT: vge.d32 r16, x0, x2 // Delay Slot 4 +; CHECK-NEXT: or r2, r16, r16 // Delay Slot 3 +; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 2 +; CHECK-NEXT: or r16, r2, r2; mov r0, r16 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call i32 @llvm.aie2.vge32(<16 x i32> %a, <16 x i32> %b, i32 %conv.i) @@ -155,12 +152,12 @@ define i64 @test_lt_v64uint8(<64 x i8> %a, <64 x i8> %b) { ; CHECK-LABEL: test_lt_v64uint8: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: vlt.d8 r25:r24, x0, x2 // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: mov r0, r24 // Delay Slot 2 -; CHECK-NEXT: mov r1, r25 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: vlt.d8 r25:r24, x0, x2 // Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: or r1, r25, r25; mov r0, r24 // Delay Slot 1 entry: %0 = tail call <2 x i32> @llvm.aie2.vlt8(<64 x i8> %a, 
<64 x i8> %b, i32 0) %1 = bitcast <2 x i32> %0 to i64 @@ -171,12 +168,12 @@ define i64 @test_lt_v64int8(<64 x i8> %a, <64 x i8> %b) { ; CHECK-LABEL: test_lt_v64int8: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: vlt.s8 r25:r24, x0, x2 // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: mov r0, r24 // Delay Slot 2 -; CHECK-NEXT: mov r1, r25 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: vlt.s8 r25:r24, x0, x2 // Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: or r1, r25, r25; mov r0, r24 // Delay Slot 1 entry: %0 = tail call <2 x i32> @llvm.aie2.vlt8(<64 x i8> %a, <64 x i8> %b, i32 1) %1 = bitcast <2 x i32> %0 to i64 @@ -187,12 +184,12 @@ define i32 @test_lt_v32uint16(<32 x i16> %a, <32 x i16> %b) { ; CHECK-LABEL: test_lt_v32uint16: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: vlt.d16 r16, x0, x2 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov r0, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: vlt.d16 r16, x0, x2 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov r0, r16 // Delay Slot 1 entry: %0 = tail call i32 @llvm.aie2.vlt16(<32 x i16> %a, <32 x i16> %b, i32 0) ret i32 %0 @@ -202,12 +199,12 @@ define i32 @test_lt_v32int16(<32 x i16> %a, <32 x i16> %b) { ; CHECK-LABEL: test_lt_v32int16: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: vlt.s16 r16, x0, x2 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; 
CHECK-NEXT: mov r0, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: vlt.s16 r16, x0, x2 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov r0, r16 // Delay Slot 1 entry: %0 = tail call i32 @llvm.aie2.vlt16(<32 x i16> %a, <32 x i16> %b, i32 1) ret i32 %0 @@ -217,12 +214,12 @@ define i32 @test_lt_v16uint32(<16 x i32> %a, <16 x i32> %b) { ; CHECK-LABEL: test_lt_v16uint32: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: vlt.d32 r16, x0, x2 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov r0, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: vlt.d32 r16, x0, x2 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov r0, r16 // Delay Slot 1 entry: %0 = tail call i32 @llvm.aie2.vlt32(<16 x i32> %a, <16 x i32> %b, i32 0) ret i32 %0 @@ -232,12 +229,12 @@ define i32 @test_lt_v16int32(<16 x i32> %a, <16 x i32> %b) { ; CHECK-LABEL: test_lt_v16int32: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: vlt.s32 r16, x0, x2 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov r0, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: vlt.s32 r16, x0, x2 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov r0, r16 // Delay Slot 1 entry: %0 = tail call i32 @llvm.aie2.vlt32(<16 x i32> %a, <16 x i32> %b, i32 1) ret i32 %0 @@ -247,13 +244,12 @@ define i64 @test_lt_v64uint8_sgn(<64 x i8> %a, <64 x i8> %b, i1 zeroext %sgn) { ; 
CHECK-LABEL: test_lt_v64uint8_sgn: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopx ; mov crVaddSign, r2 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vlt.d8 r25:r24, x0, x2 // Delay Slot 5 -; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 -; CHECK-NEXT: mov r0, r24 // Delay Slot 2 -; CHECK-NEXT: mov r1, r25 // Delay Slot 1 +; CHECK-NEXT: nopa ; ret lr ; nopm +; CHECK-NEXT: mov crVaddSign, r2 // Delay Slot 5 +; CHECK-NEXT: vlt.d8 r25:r24, x0, x2 // Delay Slot 4 +; CHECK-NEXT: nop // Delay Slot 3 +; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 2 +; CHECK-NEXT: or r1, r25, r25; mov r0, r24 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call <2 x i32> @llvm.aie2.vlt8(<64 x i8> %a, <64 x i8> %b, i32 %conv.i) @@ -265,13 +261,12 @@ define i32 @test_lt_v32int16_sgn(<32 x i16> %a, <32 x i16> %b, i1 zeroext %sgn) ; CHECK-LABEL: test_lt_v32int16_sgn: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopx ; mov crVaddSign, r1 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vlt.d16 r16, x0, x2 // Delay Slot 5 -; CHECK-NEXT: or r2, r16, r16 // Delay Slot 4 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 -; CHECK-NEXT: mov r0, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, r2 // Delay Slot 1 +; CHECK-NEXT: nopa ; nopb ; ret lr +; CHECK-NEXT: mov crVaddSign, r1 // Delay Slot 5 +; CHECK-NEXT: vlt.d16 r16, x0, x2 // Delay Slot 4 +; CHECK-NEXT: or r2, r16, r16 // Delay Slot 3 +; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 2 +; CHECK-NEXT: or r16, r2, r2; mov r0, r16 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call i32 @llvm.aie2.vlt16(<32 x i16> %a, <32 x i16> %b, i32 %conv.i) @@ -282,13 +277,12 @@ define i32 @test_lt_v16uint32_sgn(<16 x i32> %a, <16 x i32> %b, i1 zeroext %sgn) ; CHECK-LABEL: test_lt_v16uint32_sgn: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopx ; mov crVaddSign, r1 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vlt.d32 r16, x0, x2 // Delay Slot 5 -; 
CHECK-NEXT: or r2, r16, r16 // Delay Slot 4 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 -; CHECK-NEXT: mov r0, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, r2 // Delay Slot 1 +; CHECK-NEXT: nopa ; nopb ; ret lr +; CHECK-NEXT: mov crVaddSign, r1 // Delay Slot 5 +; CHECK-NEXT: vlt.d32 r16, x0, x2 // Delay Slot 4 +; CHECK-NEXT: or r2, r16, r16 // Delay Slot 3 +; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 2 +; CHECK-NEXT: or r16, r2, r2; mov r0, r16 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call i32 @llvm.aie2.vlt32(<16 x i32> %a, <16 x i32> %b, i32 %conv.i) @@ -306,12 +300,12 @@ define i32 @test_lt_v32bfloat16(<32 x bfloat> %a, <32 x bfloat> %b) { ; CHECK-LABEL: test_lt_v32bfloat16: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: vlt.bf16 r16, x0, x2 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov r0, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: vlt.bf16 r16, x0, x2 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov r0, r16 // Delay Slot 1 entry: %0 = tail call i32 @llvm.aie2.vltbf16(<32 x bfloat> %a, <32 x bfloat> %b) ret i32 %0 @@ -321,12 +315,12 @@ define i32 @test_ge_v32bfloat16(<32 x bfloat> %a, <32 x bfloat> %b) { ; CHECK-LABEL: test_ge_v32bfloat16: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: vge.bf16 r16, x0, x2 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov r0, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: vge.bf16 r16, x0, x2 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; 
CHECK-NEXT: or r16, r1, r1; mov r0, r16 // Delay Slot 1 entry: %0 = tail call i32 @llvm.aie2.vgebf16(<32 x bfloat> %a, <32 x bfloat> %b) ret i32 %0 @@ -336,12 +330,12 @@ define i32 @test_le_v32bfloat16(<32 x bfloat> %a, <32 x bfloat> %b) { ; CHECK-LABEL: test_le_v32bfloat16: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: vge.bf16 r16, x2, x0 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov r0, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: vge.bf16 r16, x2, x0 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov r0, r16 // Delay Slot 1 entry: %0 = tail call i32 @llvm.aie2.vgebf16(<32 x bfloat> %b, <32 x bfloat> %a) ret i32 %0 @@ -351,12 +345,12 @@ define i32 @test_gt_v32bfloat16(<32 x bfloat> %a, <32 x bfloat> %b) { ; CHECK-LABEL: test_gt_v32bfloat16: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: vlt.bf16 r16, x2, x0 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov r0, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: vlt.bf16 r16, x2, x0 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov r0, r16 // Delay Slot 1 entry: %0 = tail call i32 @llvm.aie2.vltbf16(<32 x bfloat> %b, <32 x bfloat> %a) ret i32 %0 @@ -366,13 +360,12 @@ define i32 @test_ltz_v32bfloat16(<32 x bfloat> %a) { ; CHECK-LABEL: test_ltz_v32bfloat16: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nop ; movxm r0, #32768 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vbcst.16 x2, r0 // Delay Slot 5 -; CHECK-NEXT: 
vlt.bf16 r16, x0, x2 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov r0, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nopa ; ret lr +; CHECK-NEXT: movxm r0, #32768 // Delay Slot 5 +; CHECK-NEXT: vbcst.16 x2, r0 // Delay Slot 4 +; CHECK-NEXT: vlt.bf16 r16, x0, x2 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov r0, r16 // Delay Slot 1 entry: %0 = tail call <32 x i16> @llvm.aie2.vbroadcast16.I512(i32 32768) %1 = bitcast <32 x i16> %0 to <32 x bfloat> @@ -384,13 +377,12 @@ define i32 @test_gtz_v32bfloat16(<32 x bfloat> %a) { ; CHECK-LABEL: test_gtz_v32bfloat16: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova r0, #0; nopb ; nopx -; CHECK-NEXT: ret lr -; CHECK-NEXT: vbcst.16 x2, r0 // Delay Slot 5 -; CHECK-NEXT: vlt.bf16 r16, x2, x0 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov r0, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nopa ; nopb ; ret lr +; CHECK-NEXT: mova r0, #0 // Delay Slot 5 +; CHECK-NEXT: vbcst.16 x2, r0 // Delay Slot 4 +; CHECK-NEXT: vlt.bf16 r16, x2, x0 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov r0, r16 // Delay Slot 1 entry: %0 = tail call <32 x i16> @llvm.aie2.vbroadcast16.I512(i32 0) %1 = bitcast <32 x i16> %0 to <32 x bfloat> @@ -403,13 +395,12 @@ define i32 @test_eq_v32bfloat16(<32 x bfloat> %a, <32 x bfloat> %b) { ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: mov r1, r16 -; CHECK-NEXT: vge.bf16 r16, x2, x0 ; CHECK-NEXT: ret lr -; CHECK-NEXT: vge.bf16 r17, x0, x2 // Delay Slot 5 -; CHECK-NEXT: or r2, r17, r17 // Delay Slot 4 -; CHECK-NEXT: and r0, r17, r16 // Delay Slot 3 -; CHECK-NEXT: mov r17, r2 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: vge.bf16 r16, x2, x0 // Delay Slot 5 +; CHECK-NEXT: vge.bf16 r17, x0, x2 // Delay Slot 4 +; 
CHECK-NEXT: or r2, r17, r17 // Delay Slot 3 +; CHECK-NEXT: and r0, r17, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov r17, r2 // Delay Slot 1 entry: %0 = tail call i32 @llvm.aie2.vgebf16(<32 x bfloat> %b, <32 x bfloat> %a) %1 = tail call i32 @llvm.aie2.vgebf16(<32 x bfloat> %a, <32 x bfloat> %b) @@ -422,13 +413,12 @@ define i32 @test_ne_v32bfloat16(<32 x bfloat> %a, <32 x bfloat> %b) { ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: mov r1, r16 -; CHECK-NEXT: vlt.bf16 r16, x0, x2 ; CHECK-NEXT: ret lr -; CHECK-NEXT: vlt.bf16 r17, x2, x0 // Delay Slot 5 -; CHECK-NEXT: or r2, r17, r17 // Delay Slot 4 -; CHECK-NEXT: or r0, r17, r16 // Delay Slot 3 -; CHECK-NEXT: mov r17, r2 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: vlt.bf16 r16, x0, x2 // Delay Slot 5 +; CHECK-NEXT: vlt.bf16 r17, x2, x0 // Delay Slot 4 +; CHECK-NEXT: or r2, r17, r17 // Delay Slot 3 +; CHECK-NEXT: or r0, r17, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov r17, r2 // Delay Slot 1 entry: %0 = tail call i32 @llvm.aie2.vltbf16(<32 x bfloat> %a, <32 x bfloat> %b) %1 = tail call i32 @llvm.aie2.vltbf16(<32 x bfloat> %b, <32 x bfloat> %a) diff --git a/llvm/test/CodeGen/AIE/aie2/veqz.ll b/llvm/test/CodeGen/AIE/aie2/veqz.ll index 8ab06aae2168..9e873243e116 100644 --- a/llvm/test/CodeGen/AIE/aie2/veqz.ll +++ b/llvm/test/CodeGen/AIE/aie2/veqz.ll @@ -11,12 +11,12 @@ define i64 @test_eqz_v64i8(<64 x i8> %a) { ; CHECK-LABEL: test_eqz_v64i8: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: veqz.8 r25:r24, x0 // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: mov r0, r24 // Delay Slot 2 -; CHECK-NEXT: mov r1, r25 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: veqz.8 r25:r24, x0 // Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: or r1, r25, r25; 
mov r0, r24 // Delay Slot 1 entry: %0 = tail call <2 x i32> @llvm.aie2.veqz8(<64 x i8> %a) %1 = bitcast <2 x i32> %0 to i64 @@ -27,12 +27,12 @@ define i32 @test_eqz_v32i16(<32 x i16> %a) { ; CHECK-LABEL: test_eqz_v32i16: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: veqz.16 r16, x0 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov r0, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: veqz.16 r16, x0 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov r0, r16 // Delay Slot 1 entry: %0 = tail call i32 @llvm.aie2.veqz16(<32 x i16> %a) ret i32 %0 @@ -42,12 +42,12 @@ define i32 @test_eqz_v16i32(<16 x i32> %a) { ; CHECK-LABEL: test_eqz_v16i32: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: veqz.32 r16, x0 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov r0, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: veqz.32 r16, x0 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov r0, r16 // Delay Slot 1 entry: %0 = tail call i32 @llvm.aie2.veqz32(<16 x i32> %a) ret i32 %0 @@ -57,12 +57,12 @@ define i32 @test_eqz_v32bf16(<32 x bfloat> %a) { ; CHECK-LABEL: test_eqz_v32bf16: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops +; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: veqz.16 r16, x0 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov r0, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, 
r1 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: veqz.16 r16, x0 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov r0, r16 // Delay Slot 1 entry: %0 = bitcast <32 x bfloat> %a to <32 x i16> %1 = tail call i32 @llvm.aie2.veqz16(<32 x i16> %0) diff --git a/llvm/test/CodeGen/AIE/aie2/vextract.ll b/llvm/test/CodeGen/AIE/aie2/vextract.ll index b52ad2ed4a7a..223bc212f476 100644 --- a/llvm/test/CodeGen/AIE/aie2/vextract.ll +++ b/llvm/test/CodeGen/AIE/aie2/vextract.ll @@ -12,13 +12,12 @@ define dso_local noundef signext i32 @_Z15test_ext_v2int4Dv16_iii(<16 x i32> nou ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopa ; mov r1, r16 -; CHECK-NEXT: mova r16, #63 ; CHECK-NEXT: ret lr -; CHECK-NEXT: mov crVaddSign, r2 // Delay Slot 5 -; CHECK-NEXT: vextract.d8 r0, x0, r16 // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: mova r16, #63 // Delay Slot 5 +; CHECK-NEXT: mov crVaddSign, r2 // Delay Slot 4 +; CHECK-NEXT: vextract.d8 r0, x0, r16 // Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov crVaddSign, #0 // Delay Slot 1 entry: %0 = bitcast <16 x i32> %v to <64 x i8> %1 = tail call i32 @llvm.aie2.vextract.elem8.I512(<64 x i8> %0, i32 63, i32 %sign) @@ -31,13 +30,12 @@ define dso_local noundef signext i32 @_Z15test_ext_v4int4Dv16_iii(<16 x i32> nou ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopa ; mov r1, r16 -; CHECK-NEXT: mova r16, #50 ; CHECK-NEXT: ret lr -; CHECK-NEXT: mov crVaddSign, r2 // Delay Slot 5 -; CHECK-NEXT: vextract.d16 r0, x0, r16 // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: mova r16, #50 // Delay Slot 5 +; CHECK-NEXT: mov crVaddSign, r2 // Delay Slot 4 +; CHECK-NEXT: vextract.d16 r0, x0, r16 // 
Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov crVaddSign, #0 // Delay Slot 1 entry: %0 = bitcast <16 x i32> %v to <32 x i16> %1 = tail call i32 @llvm.aie2.vextract.elem16.I512(<32 x i16> %0, i32 50, i32 %sign) @@ -50,13 +48,12 @@ define dso_local noundef i32 @_Z15test_ext_v8int4Dv16_iii(<16 x i32> noundef %v, ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopa ; mov r1, r16 -; CHECK-NEXT: mova r16, #10 ; CHECK-NEXT: ret lr -; CHECK-NEXT: mov crVaddSign, r2 // Delay Slot 5 -; CHECK-NEXT: vextract.d32 r0, x0, r16 // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: mova r16, #10 // Delay Slot 5 +; CHECK-NEXT: mov crVaddSign, r2 // Delay Slot 4 +; CHECK-NEXT: vextract.d32 r0, x0, r16 // Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov crVaddSign, #0 // Delay Slot 1 entry: %0 = tail call i32 @llvm.aie2.vextract.elem32.I512(<16 x i32> %v, i32 10, i32 %sign) ret i32 %0 @@ -183,16 +180,13 @@ define dso_local noundef i64 @_Z12test_ext_u64Dv16_iii(<16 x i32> noundef %v, i3 ; CHECK-LABEL: _Z12test_ext_u64Dv16_iii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; nopx ; mov r4, r16; nops -; CHECK-NEXT: mov r16, r2 -; CHECK-NEXT: mov crVaddSign, r3 -; CHECK-NEXT: vextract.d64 r25:r24, x0, r16 -; CHECK-NEXT: ret lr -; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 4 -; CHECK-NEXT: mov r16, r4 // Delay Slot 3 -; CHECK-NEXT: mov r0, r24 // Delay Slot 2 -; CHECK-NEXT: mov r1, r25 // Delay Slot 1 +; CHECK-NEXT: nopb ; nopa ; nops ; or r16, r2, r2; mov r4, r16; nopv +; CHECK-NEXT: nopa ; ret lr +; CHECK-NEXT: mov crVaddSign, r3 // Delay Slot 5 +; CHECK-NEXT: vextract.d64 r25:r24, x0, r16 // Delay Slot 4 +; CHECK-NEXT: nop // Delay Slot 3 +; CHECK-NEXT: or r16, r4, r4; mov crVaddSign, #0 // Delay Slot 2 +; CHECK-NEXT: or 
r1, r25, r25; mov r0, r24 // Delay Slot 1 entry: %0 = tail call <2 x i32> @llvm.aie2.vextract.elem64.I512(<16 x i32> %v, i32 %idx, i32 %sign) %1 = bitcast <2 x i32> %0 to i64 @@ -417,16 +411,16 @@ define dso_local noundef signext i8 @_Z5test1Dv128_a(<128 x i8> noundef %vec) { ; CHECK-LABEL: _Z5test1Dv128_a: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: paddb [sp], #32; nopxm +; CHECK-NEXT: nopa ; paddb [sp], #32; nopx ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: paddb [p0], #-32 ; CHECK-NEXT: lda r0, [p0, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: nop ; CHECK-NEXT: mov r3, r16 -; CHECK-NEXT: mova r1, #0 -; CHECK-NEXT: movx r2, #64 +; CHECK-NEXT: movx r1, #0; mov r2, #64 ; CHECK-NEXT: lt r27, r0, r2 ; CHECK-NEXT: sel.nez r1, r1, r2, r27 ; CHECK-NEXT: add r24, r27, #-1 @@ -448,16 +442,16 @@ define dso_local noundef signext i16 @_Z5test2Dv64_s(<64 x i16> noundef %vec) { ; CHECK-LABEL: _Z5test2Dv64_s: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: paddb [sp], #32; nopxm +; CHECK-NEXT: paddb [sp], #32 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: paddb [p0], #-32 ; CHECK-NEXT: lda r0, [p0, #0] ; CHECK-NEXT: nop -; CHECK-NEXT: mov r3, r16 -; CHECK-NEXT: mov r4, r17 -; CHECK-NEXT: mova r1, #0 -; CHECK-NEXT: mova r2, #32 -; CHECK-NEXT: paddb [sp], #-32 +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: or r4, r17, r17; mov r3, r16 +; CHECK-NEXT: movx r1, #0; mov r2, #32 ; CHECK-NEXT: lt r27, r0, r2 ; CHECK-NEXT: sel.nez r1, r1, r2, r27 ; CHECK-NEXT: add r17, r27, #-1 @@ -465,8 +459,8 @@ define dso_local noundef signext i16 @_Z5test2Dv64_s(<64 x i16> noundef %vec) { ; CHECK-NEXT: vsel.16 x0, x4, x5, r17 // Delay Slot 5 ; CHECK-NEXT: sub r16, r0, r1 // Delay Slot 4 ; CHECK-NEXT: vextract.s16 r0, x0, r16 // Delay Slot 3 -; CHECK-NEXT: or r17, r4, r4 // Delay Slot 2 -; CHECK-NEXT: mov r16, r3 // Delay Slot 1 +; CHECK-NEXT: paddb [sp], #-32 // Delay Slot 2 +; CHECK-NEXT: or r16, r3, r3; mov r17, 
r4 // Delay Slot 1 entry: %idx = alloca i32, align 4 %idx.0.idx.0.idx.0.idx.0. = load volatile i32, ptr %idx, align 4 @@ -478,16 +472,16 @@ define dso_local noundef i32 @_Z5test3Dv32_i(<32 x i32> noundef %vec) { ; CHECK-LABEL: _Z5test3Dv32_i: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: paddb [sp], #32; nopxm +; CHECK-NEXT: paddb [sp], #32 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: paddb [p0], #-32 ; CHECK-NEXT: lda r0, [p0, #0] ; CHECK-NEXT: nop -; CHECK-NEXT: mov r3, r16 -; CHECK-NEXT: mov r4, r17 -; CHECK-NEXT: mova r1, #0 -; CHECK-NEXT: mova r2, #16 -; CHECK-NEXT: paddb [sp], #-32 +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: or r4, r17, r17; mov r3, r16 +; CHECK-NEXT: movx r1, #0; mov r2, #16 ; CHECK-NEXT: lt r27, r0, r2 ; CHECK-NEXT: sel.nez r1, r1, r2, r27 ; CHECK-NEXT: add r17, r27, #-1 @@ -495,8 +489,8 @@ define dso_local noundef i32 @_Z5test3Dv32_i(<32 x i32> noundef %vec) { ; CHECK-NEXT: vsel.32 x0, x4, x5, r17 // Delay Slot 5 ; CHECK-NEXT: sub r16, r0, r1 // Delay Slot 4 ; CHECK-NEXT: vextract.s32 r0, x0, r16 // Delay Slot 3 -; CHECK-NEXT: or r17, r4, r4 // Delay Slot 2 -; CHECK-NEXT: mov r16, r3 // Delay Slot 1 +; CHECK-NEXT: paddb [sp], #-32 // Delay Slot 2 +; CHECK-NEXT: or r16, r3, r3; mov r17, r4 // Delay Slot 1 entry: %idx = alloca i32, align 4 %idx.0.idx.0.idx.0.idx.0. 
= load volatile i32, ptr %idx, align 4 diff --git a/llvm/test/CodeGen/AIE/aie2/vinsert.ll b/llvm/test/CodeGen/AIE/aie2/vinsert.ll index e9a77bb4dcef..6bdc365deb8b 100644 --- a/llvm/test/CodeGen/AIE/aie2/vinsert.ll +++ b/llvm/test/CodeGen/AIE/aie2/vinsert.ll @@ -106,11 +106,11 @@ define <2 x i32> @test_64bit(<2 x i32> %vec, i32 %c) { ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv -; CHECK-NEXT: nopx // Delay Slot 5 +; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: mov r16, r18 // Delay Slot 2 -; CHECK-NEXT: mov r17, r0 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: or r17, r0, r0; mov r16, r18 // Delay Slot 1 entry: %vecins = insertelement <2 x i32> %vec, i32 %c, i32 1 ret <2 x i32> %vecins diff --git a/llvm/test/CodeGen/AIE/aie2/vmac.ll b/llvm/test/CodeGen/AIE/aie2/vmac.ll index 8e73066effd2..91f9efb4eedc 100644 --- a/llvm/test/CodeGen/AIE/aie2/vmac.ll +++ b/llvm/test/CodeGen/AIE/aie2/vmac.ll @@ -11,12 +11,8 @@ define <16 x i64> @_Z21test_mac_4x2_2x4_confiiiiii(i32 noundef %sgn_x, i32 noun ; CHECK-LABEL: _Z21test_mac_4x2_2x4_confiiiiii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopb ; mova r6, #10; nops ; nopxm ; nopv -; CHECK-NEXT: mova r7, #11 -; CHECK-NEXT: mova r8, #12 -; CHECK-NEXT: mova r9, #9 -; CHECK-NEXT: mova r10, #8 -; CHECK-NEXT: mova r11, #2 +; CHECK-NEXT: mova r6, #10; movx r8, #12; mov r7, #11 +; CHECK-NEXT: mova r9, #9; movx r11, #2; mov r10, #8 ; CHECK-NEXT: lshl r3, r3, r6 ; CHECK-NEXT: lshl r4, r4, r7 ; CHECK-NEXT: lshl r5, r5, r8 @@ -96,9 +92,8 @@ define <16 x i64> @_Z21test_negmac_4x16_16x8Dv64_hiDv16_jiDv16_u7__acc64(<64 x ; CHECK-LABEL: _Z21test_negmac_4x16_16x8Dv64_hiDv16_jiDv16_u7__acc64: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopb ; mova r2, #9; nops ; nopxm ; nopv -; CHECK-NEXT: mova r3, #8 -; CHECK-NEXT: lshl r0, r0, r2 +; CHECK-NEXT: 
nopb ; mova r2, #9; nops ; movx r3, #8; nopm ; nopv +; CHECK-NEXT: nopa ; nopb ; lshl r0, r0, r2 ; CHECK-NEXT: lshl r1, r1, r3 ; CHECK-NEXT: or r0, r1, r0 ; CHECK-NEXT: ret lr @@ -120,9 +115,7 @@ define <16 x i64> @_Z24test_negmac_4x8_8x4_confDv32_sDv64_aDv16_u7__acc64iii(<3 ; CHECK-LABEL: _Z24test_negmac_4x8_8x4_confDv32_sDv64_aDv16_u7__acc64iii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova r3, #11 -; CHECK-NEXT: mova r4, #12 -; CHECK-NEXT: mova r5, #818 +; CHECK-NEXT: mova r3, #11; nopb ; movx r5, #818; mov r4, #12 ; CHECK-NEXT: lshl r1, r1, r3 ; CHECK-NEXT: lshl r2, r2, r4 ; CHECK-NEXT: or r0, r1, r0 @@ -151,9 +144,7 @@ define <16 x i64> @_Z19test_negmac_2x4_4x8Dv32_tiDv32_siDv16_u7__acc64(<32 x i1 ; CHECK-LABEL: _Z19test_negmac_2x4_4x8Dv32_tiDv32_siDv16_u7__acc64: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova r2, #9; nopb ; nopx -; CHECK-NEXT: mova r3, #8 -; CHECK-NEXT: mova r4, #26 +; CHECK-NEXT: nopb ; mova r2, #9; nops ; movx r4, #26; mov r3, #8; nopv ; CHECK-NEXT: lshl r0, r0, r2 ; CHECK-NEXT: lshl r1, r1, r3 ; CHECK-NEXT: or r0, r0, r1 @@ -215,12 +206,8 @@ define <16 x i64> @_Z23test_msc_elem_16_2_confDv32_tiS_iDv16_u7__acc64iiii(<32 ; CHECK-LABEL: _Z23test_msc_elem_16_2_confDv32_tiS_iDv16_u7__acc64iiii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopb ; mova r6, #10; nops ; nopxm ; nopv -; CHECK-NEXT: mova r7, #11 -; CHECK-NEXT: mova r8, #12 -; CHECK-NEXT: mova r9, #9 -; CHECK-NEXT: mova r10, #8 -; CHECK-NEXT: mova r11, #90 +; CHECK-NEXT: mova r6, #10; movx r8, #12; mov r7, #11 +; CHECK-NEXT: mova r9, #9; movx r11, #90; mov r10, #8 ; CHECK-NEXT: lshl r3, r3, r6 ; CHECK-NEXT: lshl r4, r4, r7 ; CHECK-NEXT: lshl r5, r5, r8 @@ -261,10 +248,8 @@ define <16 x i64> @_Z21test_msc_4x4_4x4_confDv32_tDv32_sDv16_u7__acc64iiii(<32 ; CHECK-LABEL: _Z21test_msc_4x4_4x4_confDv32_tDv32_sDv16_u7__acc64iiii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova r4, #10; nopb 
; nopx -; CHECK-NEXT: mova r5, #11 -; CHECK-NEXT: mova r6, #12 -; CHECK-NEXT: mova r7, #314 +; CHECK-NEXT: mova r4, #10; nopxm +; CHECK-NEXT: mova r5, #11; movx r7, #314; mov r6, #12 ; CHECK-NEXT: lshl r1, r1, r4 ; CHECK-NEXT: lshl r2, r2, r5 ; CHECK-NEXT: lshl r3, r3, r6 diff --git a/llvm/test/CodeGen/AIE/aie2/vmax_lt.ll b/llvm/test/CodeGen/AIE/aie2/vmax_lt.ll index 1947a0dd67f7..1e6a6b17e810 100644 --- a/llvm/test/CodeGen/AIE/aie2/vmax_lt.ll +++ b/llvm/test/CodeGen/AIE/aie2/vmax_lt.ll @@ -177,13 +177,12 @@ define dso_local noundef <32 x i16> @_Z21test_max_lt_v32uint16Dv32_tS_bRj(<32 x ; CHECK-LABEL: _Z21test_max_lt_v32uint16Dv32_tS_bRj: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopx ; mov crVaddSign, r0 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vmax_lt.d16 x0, r16, x2, x4 // Delay Slot 5 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 4 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 +; CHECK-NEXT: nopa ; nopb ; ret lr +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 +; CHECK-NEXT: vmax_lt.d16 x0, r16, x2, x4 // Delay Slot 4 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 ; CHECK-NEXT: st r16, [p0, #0] // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: or r16, r1, r1; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call { <32 x i16>, i32 } @llvm.aie2.vmax.lt16(<32 x i16> %a, <32 x i16> %b, i32 %conv.i) @@ -213,12 +212,12 @@ define dso_local noundef <32 x i16> @_Z18test_max_v32uint16Dv32_tS_b(<32 x i16> ; CHECK-LABEL: _Z18test_max_v32uint16Dv32_tS_b: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm -; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 -; CHECK-NEXT: vmax_lt.d16 x0, r16, x2, x4 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nopa ; ret lr ; nopm +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: mov 
crVaddSign, r0 // Delay Slot 4 +; CHECK-NEXT: vmax_lt.d16 x0, r16, x2, x4 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i.i = zext i1 %sgn to i32 %0 = tail call { <32 x i16>, i32 } @llvm.aie2.vmax.lt16(<32 x i16> %a, <32 x i16> %b, i32 %conv.i.i) @@ -250,13 +249,12 @@ define dso_local noundef <32 x i16> @_Z20test_max_lt_v32int16Dv32_sS_bRj(<32 x i ; CHECK-LABEL: _Z20test_max_lt_v32int16Dv32_sS_bRj: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopx ; mov crVaddSign, r0 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vmax_lt.d16 x0, r16, x2, x4 // Delay Slot 5 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 4 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 +; CHECK-NEXT: nopa ; nopb ; ret lr +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 +; CHECK-NEXT: vmax_lt.d16 x0, r16, x2, x4 // Delay Slot 4 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 ; CHECK-NEXT: st r16, [p0, #0] // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: or r16, r1, r1; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call { <32 x i16>, i32 } @llvm.aie2.vmax.lt16(<32 x i16> %a, <32 x i16> %b, i32 %conv.i) @@ -286,12 +284,12 @@ define dso_local noundef <32 x i16> @_Z17test_max_v32int16Dv32_sS_b(<32 x i16> n ; CHECK-LABEL: _Z17test_max_v32int16Dv32_sS_b: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm -; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 -; CHECK-NEXT: vmax_lt.d16 x0, r16, x2, x4 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nopa ; ret lr ; nopm +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 4 +; CHECK-NEXT: vmax_lt.d16 x0, r16, x2, x4 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or 
r16, r1, r1; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i.i = zext i1 %sgn to i32 %0 = tail call { <32 x i16>, i32 } @llvm.aie2.vmax.lt16(<32 x i16> %a, <32 x i16> %b, i32 %conv.i.i) @@ -304,13 +302,12 @@ define dso_local noundef <16 x i32> @_Z21test_max_lt_v16uint32Dv16_jS_bRj(<16 x ; CHECK-LABEL: _Z21test_max_lt_v16uint32Dv16_jS_bRj: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopx ; mov crVaddSign, r0 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vmax_lt.d32 x0, r16, x2, x4 // Delay Slot 5 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 4 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 +; CHECK-NEXT: nopa ; nopb ; ret lr +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 +; CHECK-NEXT: vmax_lt.d32 x0, r16, x2, x4 // Delay Slot 4 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 ; CHECK-NEXT: st r16, [p0, #0] // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: or r16, r1, r1; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call { <16 x i32>, i32 } @llvm.aie2.vmax.lt32(<16 x i32> %a, <16 x i32> %b, i32 %conv.i) @@ -340,12 +337,12 @@ define dso_local noundef <16 x i32> @_Z18test_max_v16uint32Dv16_jS_b(<16 x i32> ; CHECK-LABEL: _Z18test_max_v16uint32Dv16_jS_b: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm -; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 -; CHECK-NEXT: vmax_lt.d32 x0, r16, x2, x4 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nopa ; ret lr ; nopm +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 4 +; CHECK-NEXT: vmax_lt.d32 x0, r16, x2, x4 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i.i = zext i1 %sgn to i32 %0 = tail call { <16 x i32>, i32 } @llvm.aie2.vmax.lt32(<16 x i32> %a, 
<16 x i32> %b, i32 %conv.i.i) @@ -377,13 +374,12 @@ define dso_local noundef <16 x i32> @_Z20test_max_lt_v16int32Dv16_iS_bRj(<16 x i ; CHECK-LABEL: _Z20test_max_lt_v16int32Dv16_iS_bRj: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopx ; mov crVaddSign, r0 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vmax_lt.d32 x0, r16, x2, x4 // Delay Slot 5 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 4 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 +; CHECK-NEXT: nopa ; nopb ; ret lr +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 +; CHECK-NEXT: vmax_lt.d32 x0, r16, x2, x4 // Delay Slot 4 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 ; CHECK-NEXT: st r16, [p0, #0] // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: or r16, r1, r1; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call { <16 x i32>, i32 } @llvm.aie2.vmax.lt32(<16 x i32> %a, <16 x i32> %b, i32 %conv.i) @@ -413,12 +409,12 @@ define dso_local noundef <16 x i32> @_Z17test_max_v16int32Dv16_iS_b(<16 x i32> n ; CHECK-LABEL: _Z17test_max_v16int32Dv16_iS_b: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm -; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 -; CHECK-NEXT: vmax_lt.d32 x0, r16, x2, x4 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nopa ; ret lr ; nopm +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 4 +; CHECK-NEXT: vmax_lt.d32 x0, r16, x2, x4 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i.i = zext i1 %sgn to i32 %0 = tail call { <16 x i32>, i32 } @llvm.aie2.vmax.lt32(<16 x i32> %a, <16 x i32> %b, i32 %conv.i.i) diff --git a/llvm/test/CodeGen/AIE/aie2/vmaxdiff_lt.ll b/llvm/test/CodeGen/AIE/aie2/vmaxdiff_lt.ll index 
a67d8d4e867d..2a107bb6aeed 100644 --- a/llvm/test/CodeGen/AIE/aie2/vmaxdiff_lt.ll +++ b/llvm/test/CodeGen/AIE/aie2/vmaxdiff_lt.ll @@ -72,13 +72,12 @@ define <32 x i16> @test_vmaxdiff_lt_v32uint16_tbRj(<32 x i16> %a, <32 x i16> % ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopb ; mova r1, #1; nops ; nopxm ; nopv ; CHECK-NEXT: and r0, r0, r1 -; CHECK-NEXT: mov crVaddSign, r0 ; CHECK-NEXT: ret lr -; CHECK-NEXT: vmaxdiff_lt.d16 x0, r16, x2, x4 // Delay Slot 5 -; CHECK-NEXT: or r2, r16, r16 // Delay Slot 4 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 +; CHECK-NEXT: vmaxdiff_lt.d16 x0, r16, x2, x4 // Delay Slot 4 +; CHECK-NEXT: or r2, r16, r16 // Delay Slot 3 ; CHECK-NEXT: st r16, [p0, #0] // Delay Slot 2 -; CHECK-NEXT: mov r16, r2 // Delay Slot 1 +; CHECK-NEXT: or r16, r2, r2; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call { <32 x i16>, i32 } @llvm.aie2.vmaxdiff.lt16(<32 x i16> %a, <32 x i16> %b, i32 %conv.i) @@ -130,13 +129,12 @@ define <16 x i32> @test_vmaxdiff_lt_v16int32(<16 x i32> %a, <16 x i32> %b, i1 % ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopb ; mova r1, #1; nops ; nopxm ; nopv ; CHECK-NEXT: and r0, r0, r1 -; CHECK-NEXT: mov crVaddSign, r0 ; CHECK-NEXT: ret lr -; CHECK-NEXT: vmaxdiff_lt.d32 x0, r16, x2, x4 // Delay Slot 5 -; CHECK-NEXT: or r2, r16, r16 // Delay Slot 4 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 +; CHECK-NEXT: vmaxdiff_lt.d32 x0, r16, x2, x4 // Delay Slot 4 +; CHECK-NEXT: or r2, r16, r16 // Delay Slot 3 ; CHECK-NEXT: st r16, [p0, #0] // Delay Slot 2 -; CHECK-NEXT: mov r16, r2 // Delay Slot 1 +; CHECK-NEXT: or r16, r2, r2; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call { <16 x i32>, i32 } @llvm.aie2.vmaxdiff.lt32(<16 x i32> %a, <16 x i32> %b, i32 %conv.i) diff --git a/llvm/test/CodeGen/AIE/aie2/vmin_ge.ll b/llvm/test/CodeGen/AIE/aie2/vmin_ge.ll 
index 6e4dc24218f1..7d153ae8af1c 100644 --- a/llvm/test/CodeGen/AIE/aie2/vmin_ge.ll +++ b/llvm/test/CodeGen/AIE/aie2/vmin_ge.ll @@ -177,13 +177,12 @@ define dso_local noundef <32 x i16> @_Z21test_min_ge_v32uint16Dv32_tS_bRj(<32 x ; CHECK-LABEL: _Z21test_min_ge_v32uint16Dv32_tS_bRj: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopx ; mov crVaddSign, r0 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vmin_ge.d16 x0, r16, x2, x4 // Delay Slot 5 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 4 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 +; CHECK-NEXT: nopa ; nopb ; ret lr +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 +; CHECK-NEXT: vmin_ge.d16 x0, r16, x2, x4 // Delay Slot 4 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 ; CHECK-NEXT: st r16, [p0, #0] // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: or r16, r1, r1; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call { <32 x i16>, i32 } @llvm.aie2.vmin.ge16(<32 x i16> %a, <32 x i16> %b, i32 %conv.i) @@ -213,12 +212,12 @@ define dso_local noundef <32 x i16> @_Z18test_min_v32uint16Dv32_tS_b(<32 x i16> ; CHECK-LABEL: _Z18test_min_v32uint16Dv32_tS_b: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm -; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 -; CHECK-NEXT: vmin_ge.d16 x0, r16, x2, x4 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nopa ; ret lr ; nopm +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 4 +; CHECK-NEXT: vmin_ge.d16 x0, r16, x2, x4 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i.i = zext i1 %sgn to i32 %0 = tail call { <32 x i16>, i32 } @llvm.aie2.vmin.ge16(<32 x i16> %a, <32 x i16> %b, i32 %conv.i.i) @@ -250,13 +249,12 @@ define 
dso_local noundef <32 x i16> @_Z20test_min_ge_v32int16Dv32_sS_bRj(<32 x i ; CHECK-LABEL: _Z20test_min_ge_v32int16Dv32_sS_bRj: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopx ; mov crVaddSign, r0 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vmin_ge.d16 x0, r16, x2, x4 // Delay Slot 5 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 4 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 +; CHECK-NEXT: nopa ; nopb ; ret lr +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 +; CHECK-NEXT: vmin_ge.d16 x0, r16, x2, x4 // Delay Slot 4 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 ; CHECK-NEXT: st r16, [p0, #0] // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: or r16, r1, r1; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call { <32 x i16>, i32 } @llvm.aie2.vmin.ge16(<32 x i16> %a, <32 x i16> %b, i32 %conv.i) @@ -286,12 +284,12 @@ define dso_local noundef <32 x i16> @_Z17test_min_v32int16Dv32_sS_b(<32 x i16> n ; CHECK-LABEL: _Z17test_min_v32int16Dv32_sS_b: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm -; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 -; CHECK-NEXT: vmin_ge.d16 x0, r16, x2, x4 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nopa ; ret lr ; nopm +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 4 +; CHECK-NEXT: vmin_ge.d16 x0, r16, x2, x4 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i.i = zext i1 %sgn to i32 %0 = tail call { <32 x i16>, i32 } @llvm.aie2.vmin.ge16(<32 x i16> %a, <32 x i16> %b, i32 %conv.i.i) @@ -304,13 +302,12 @@ define dso_local noundef <16 x i32> @_Z21test_min_ge_v16uint32Dv16_jS_bRj(<16 x ; CHECK-LABEL: _Z21test_min_ge_v16uint32Dv16_jS_bRj: ; CHECK: .p2align 4 ; 
CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopx ; mov crVaddSign, r0 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vmin_ge.d32 x0, r16, x2, x4 // Delay Slot 5 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 4 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 +; CHECK-NEXT: nopa ; nopb ; ret lr +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 +; CHECK-NEXT: vmin_ge.d32 x0, r16, x2, x4 // Delay Slot 4 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 ; CHECK-NEXT: st r16, [p0, #0] // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: or r16, r1, r1; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call { <16 x i32>, i32 } @llvm.aie2.vmin.ge32(<16 x i32> %a, <16 x i32> %b, i32 %conv.i) @@ -340,12 +337,12 @@ define dso_local noundef <16 x i32> @_Z18test_min_v16uint32Dv16_jS_b(<16 x i32> ; CHECK-LABEL: _Z18test_min_v16uint32Dv16_jS_b: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm -; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 -; CHECK-NEXT: vmin_ge.d32 x0, r16, x2, x4 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nopa ; ret lr ; nopm +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 4 +; CHECK-NEXT: vmin_ge.d32 x0, r16, x2, x4 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i.i = zext i1 %sgn to i32 %0 = tail call { <16 x i32>, i32 } @llvm.aie2.vmin.ge32(<16 x i32> %a, <16 x i32> %b, i32 %conv.i.i) @@ -377,13 +374,12 @@ define dso_local noundef <16 x i32> @_Z20test_min_ge_v16int32Dv16_iS_bRj(<16 x i ; CHECK-LABEL: _Z20test_min_ge_v16int32Dv16_iS_bRj: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopx ; mov crVaddSign, r0 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vmin_ge.d32 x0, r16, x2, x4 // Delay Slot 5 -; 
CHECK-NEXT: or r1, r16, r16 // Delay Slot 4 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 +; CHECK-NEXT: nopa ; nopb ; ret lr +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 +; CHECK-NEXT: vmin_ge.d32 x0, r16, x2, x4 // Delay Slot 4 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 ; CHECK-NEXT: st r16, [p0, #0] // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: or r16, r1, r1; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call { <16 x i32>, i32 } @llvm.aie2.vmin.ge32(<16 x i32> %a, <16 x i32> %b, i32 %conv.i) @@ -413,12 +409,12 @@ define dso_local noundef <16 x i32> @_Z17test_min_v16int32Dv16_iS_b(<16 x i32> n ; CHECK-LABEL: _Z17test_min_v16int32Dv16_iS_b: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm -; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 -; CHECK-NEXT: vmin_ge.d32 x0, r16, x2, x4 // Delay Slot 4 -; CHECK-NEXT: or r1, r16, r16 // Delay Slot 3 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r1 // Delay Slot 1 +; CHECK-NEXT: nopa ; ret lr ; nopm +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 4 +; CHECK-NEXT: vmin_ge.d32 x0, r16, x2, x4 // Delay Slot 3 +; CHECK-NEXT: or r1, r16, r16 // Delay Slot 2 +; CHECK-NEXT: or r16, r1, r1; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i.i = zext i1 %sgn to i32 %0 = tail call { <16 x i32>, i32 } @llvm.aie2.vmin.ge32(<16 x i32> %a, <16 x i32> %b, i32 %conv.i.i) diff --git a/llvm/test/CodeGen/AIE/aie2/vmul.ll b/llvm/test/CodeGen/AIE/aie2/vmul.ll index bc81b233615d..acdcd5607739 100644 --- a/llvm/test/CodeGen/AIE/aie2/vmul.ll +++ b/llvm/test/CodeGen/AIE/aie2/vmul.ll @@ -85,9 +85,7 @@ define <16 x i64> @_Z18test_mul_elem_32_2iDv64_hi(i32 noundef %sgn_x, <64 x i8> ; CHECK-LABEL: _Z18test_mul_elem_32_2iDv64_hi: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova r2, #9; nopb ; nopx -; CHECK-NEXT: mova r3, #8 -; CHECK-NEXT: mova 
r4, #40 +; CHECK-NEXT: nopb ; mova r2, #9; nops ; movx r4, #40; mov r3, #8; nopv ; CHECK-NEXT: lshl r0, r0, r2 ; CHECK-NEXT: lshl r1, r1, r3 ; CHECK-NEXT: or r0, r0, r1 diff --git a/llvm/test/CodeGen/AIE/aie2/vnegmul.ll b/llvm/test/CodeGen/AIE/aie2/vnegmul.ll index 4ef426c1fd3d..9bd70b10550b 100644 --- a/llvm/test/CodeGen/AIE/aie2/vnegmul.ll +++ b/llvm/test/CodeGen/AIE/aie2/vnegmul.ll @@ -11,8 +11,7 @@ define <16 x i64> @_Z19test_negmul_4x8_8x8ii(i32 noundef %sgn_x, i32 noundef %sg ; CHECK-LABEL: _Z19test_negmul_4x8_8x8ii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova r2, #9; nopb ; nopxm -; CHECK-NEXT: mova r3, #8 +; CHECK-NEXT: nopb ; mova r2, #9; nops ; movx r3, #8; nopm ; nopv ; CHECK-NEXT: lshl r0, r0, r2 ; CHECK-NEXT: lshl r1, r1, r3 ; CHECK-NEXT: or r0, r0, r1 diff --git a/llvm/test/CodeGen/AIE/aie2/vst_srs.ll b/llvm/test/CodeGen/AIE/aie2/vst_srs.ll index a91417a6e0fb..b81d92bf7633 100644 --- a/llvm/test/CodeGen/AIE/aie2/vst_srs.ll +++ b/llvm/test/CodeGen/AIE/aie2/vst_srs.ll @@ -13,8 +13,7 @@ define dso_local noundef <16 x i16> @_Z5test0Dv16_u7__acc32(<8 x i64> noundef %a ; CHECK-LABEL: _Z5test0Dv16_u7__acc32: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: paddb [sp], #32 -; CHECK-NEXT: mova r0, #2 +; CHECK-NEXT: padda [sp], #32; nopb ; movx r0, #2 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: mov s0, r0 ; CHECK-NEXT: paddb [p0], #-32 @@ -49,8 +48,7 @@ define dso_local noundef <8 x i32> @_Z5test1Dv8_u7__acc64(<8 x i64> noundef %acc ; CHECK-LABEL: _Z5test1Dv8_u7__acc64: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: paddb [sp], #32 -; CHECK-NEXT: mova r0, #2 +; CHECK-NEXT: padda [sp], #32; nopb ; movx r0, #2 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: mov s0, r0 ; CHECK-NEXT: paddb [p0], #-32 @@ -79,8 +77,7 @@ define dso_local noundef <16 x i16> @_Z5test2Dv16_u7__acc32(<8 x i64> noundef %a ; CHECK-LABEL: _Z5test2Dv16_u7__acc32: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: 
paddb [sp], #32 -; CHECK-NEXT: mova r0, #2 +; CHECK-NEXT: padda [sp], #32; nopb ; movx r0, #2 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: mov s0, r0 ; CHECK-NEXT: paddb [p0], #-32 @@ -109,8 +106,7 @@ define dso_local noundef <8 x i32> @_Z5test3Dv8_u7__acc64(<8 x i64> noundef %acc ; CHECK-LABEL: _Z5test3Dv8_u7__acc64: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: paddb [sp], #32 -; CHECK-NEXT: mova r0, #2 +; CHECK-NEXT: padda [sp], #32; nopb ; movx r0, #2 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: mov s0, r0 ; CHECK-NEXT: paddb [p0], #-32 @@ -139,8 +135,7 @@ define dso_local noundef <16 x i16> @_Z5test4Dv16_u7__acc64(<16 x i64> noundef % ; CHECK-LABEL: _Z5test4Dv16_u7__acc64: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: paddb [sp], #32 -; CHECK-NEXT: mova r0, #2 +; CHECK-NEXT: padda [sp], #32; nopb ; movx r0, #2 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: mov s0, r0 ; CHECK-NEXT: paddb [p0], #-32 @@ -169,8 +164,7 @@ define dso_local noundef <32 x i8> @_Z5test5Dv32_u7__acc32(<16 x i64> noundef %a ; CHECK-LABEL: _Z5test5Dv32_u7__acc32: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: paddb [sp], #32 -; CHECK-NEXT: mova r0, #2 +; CHECK-NEXT: padda [sp], #32; nopb ; movx r0, #2 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: mov s0, r0 ; CHECK-NEXT: paddb [p0], #-32 @@ -199,8 +193,7 @@ define dso_local noundef <16 x i16> @_Z5test6Dv16_u7__acc64(<16 x i64> noundef % ; CHECK-LABEL: _Z5test6Dv16_u7__acc64: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: paddb [sp], #32 -; CHECK-NEXT: mova r0, #2 +; CHECK-NEXT: padda [sp], #32; nopb ; movx r0, #2 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: mov s0, r0 ; CHECK-NEXT: paddb [p0], #-32 @@ -229,8 +222,7 @@ define dso_local noundef <32 x i8> @_Z5test7Dv32_u7__acc32(<16 x i64> noundef %a ; CHECK-LABEL: _Z5test7Dv32_u7__acc32: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: paddb [sp], #32 -; CHECK-NEXT: mova r0, #2 +; CHECK-NEXT: padda [sp], #32; nopb 
; movx r0, #2 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: mov s0, r0 ; CHECK-NEXT: paddb [p0], #-32 @@ -259,8 +251,7 @@ define dso_local noundef <16 x i16> @_Z5test8Dv16_u7__acc64(<16 x i64> noundef % ; CHECK-LABEL: _Z5test8Dv16_u7__acc64: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: paddb [sp], #32 -; CHECK-NEXT: mova r0, #2 +; CHECK-NEXT: padda [sp], #32; nopb ; movx r0, #2 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: mov s0, r0 ; CHECK-NEXT: paddb [p0], #-32 @@ -289,8 +280,7 @@ define dso_local noundef <32 x i8> @_Z5test9Dv32_u7__acc32(<16 x i64> noundef %a ; CHECK-LABEL: _Z5test9Dv32_u7__acc32: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: paddb [sp], #32 -; CHECK-NEXT: mova r0, #2 +; CHECK-NEXT: padda [sp], #32; nopb ; movx r0, #2 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: mov s0, r0 ; CHECK-NEXT: paddb [p0], #-32 @@ -319,8 +309,7 @@ define dso_local noundef <16 x i16> @_Z6test10Dv16_u7__acc32(<8 x i64> noundef % ; CHECK-LABEL: _Z6test10Dv16_u7__acc32: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: paddb [sp], #32 -; CHECK-NEXT: mova r0, #4 +; CHECK-NEXT: padda [sp], #32; nopb ; movx r0, #4 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: mov s0, r0 ; CHECK-NEXT: paddb [p0], #-32 @@ -349,8 +338,7 @@ define dso_local noundef <8 x i32> @_Z6test11Dv8_u7__acc64(<8 x i64> noundef %ac ; CHECK-LABEL: _Z6test11Dv8_u7__acc64: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: paddb [sp], #32 -; CHECK-NEXT: mova r0, #4 +; CHECK-NEXT: padda [sp], #32; nopb ; movx r0, #4 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: mov s0, r0 ; CHECK-NEXT: paddb [p0], #-32 diff --git a/llvm/test/CodeGen/AIE/aie2/vsub-cmp.ll b/llvm/test/CodeGen/AIE/aie2/vsub-cmp.ll index 234f0f707a83..874cb56c3bdc 100644 --- a/llvm/test/CodeGen/AIE/aie2/vsub-cmp.ll +++ b/llvm/test/CodeGen/AIE/aie2/vsub-cmp.ll @@ -71,13 +71,12 @@ define <32 x i16> @test_vsub_ge_v32uint16_tbRj(<32 x i16> %a, <32 x i16> %b, i ; CHECK-NEXT: // %bb.0: // %entry ; 
CHECK-NEXT: nopb ; mova r1, #1; nops ; nopxm ; nopv ; CHECK-NEXT: and r0, r0, r1 -; CHECK-NEXT: mov crVaddSign, r0 ; CHECK-NEXT: ret lr -; CHECK-NEXT: vsub_ge.d16 x0, r16, x2, x4 // Delay Slot 5 -; CHECK-NEXT: or r2, r16, r16 // Delay Slot 4 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 +; CHECK-NEXT: vsub_ge.d16 x0, r16, x2, x4 // Delay Slot 4 +; CHECK-NEXT: or r2, r16, r16 // Delay Slot 3 ; CHECK-NEXT: st r16, [p0, #0] // Delay Slot 2 -; CHECK-NEXT: mov r16, r2 // Delay Slot 1 +; CHECK-NEXT: or r16, r2, r2; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call { <32 x i16>, i32 } @llvm.aie2.vsub.ge16(<32 x i16> %a, <32 x i16> %b, i32 %conv.i) @@ -129,13 +128,12 @@ define <16 x i32> @test_vsub_ge_v16int32(<16 x i32> %a, <16 x i32> %b, i1 %sgn, ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopb ; mova r1, #1; nops ; nopxm ; nopv ; CHECK-NEXT: and r0, r0, r1 -; CHECK-NEXT: mov crVaddSign, r0 ; CHECK-NEXT: ret lr -; CHECK-NEXT: vsub_ge.d32 x0, r16, x2, x4 // Delay Slot 5 -; CHECK-NEXT: or r2, r16, r16 // Delay Slot 4 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 +; CHECK-NEXT: vsub_ge.d32 x0, r16, x2, x4 // Delay Slot 4 +; CHECK-NEXT: or r2, r16, r16 // Delay Slot 3 ; CHECK-NEXT: st r16, [p0, #0] // Delay Slot 2 -; CHECK-NEXT: mov r16, r2 // Delay Slot 1 +; CHECK-NEXT: or r16, r2, r2; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call { <16 x i32>, i32 } @llvm.aie2.vsub.ge32(<16 x i32> %a, <16 x i32> %b, i32 %conv.i) @@ -246,13 +244,12 @@ define <32 x i16> @test_vsub_lt_v32uint16_tbRj(<32 x i16> %a, <32 x i16> %b, i ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopb ; mova r1, #1; nops ; nopxm ; nopv ; CHECK-NEXT: and r0, r0, r1 -; CHECK-NEXT: mov crVaddSign, r0 ; CHECK-NEXT: ret lr -; CHECK-NEXT: vsub_lt.d16 x0, r16, x2, x4 // Delay Slot 5 -; CHECK-NEXT: or r2, r16, r16 // Delay Slot 4 -; 
CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 +; CHECK-NEXT: vsub_lt.d16 x0, r16, x2, x4 // Delay Slot 4 +; CHECK-NEXT: or r2, r16, r16 // Delay Slot 3 ; CHECK-NEXT: st r16, [p0, #0] // Delay Slot 2 -; CHECK-NEXT: mov r16, r2 // Delay Slot 1 +; CHECK-NEXT: or r16, r2, r2; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call { <32 x i16>, i32 } @llvm.aie2.vsub.lt16(<32 x i16> %a, <32 x i16> %b, i32 %conv.i) @@ -304,13 +301,12 @@ define <16 x i32> @test_vsub_lt_v16int32(<16 x i32> %a, <16 x i32> %b, i1 %sgn, ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopb ; mova r1, #1; nops ; nopxm ; nopv ; CHECK-NEXT: and r0, r0, r1 -; CHECK-NEXT: mov crVaddSign, r0 ; CHECK-NEXT: ret lr -; CHECK-NEXT: vsub_lt.d32 x0, r16, x2, x4 // Delay Slot 5 -; CHECK-NEXT: or r2, r16, r16 // Delay Slot 4 -; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 3 +; CHECK-NEXT: mov crVaddSign, r0 // Delay Slot 5 +; CHECK-NEXT: vsub_lt.d32 x0, r16, x2, x4 // Delay Slot 4 +; CHECK-NEXT: or r2, r16, r16 // Delay Slot 3 ; CHECK-NEXT: st r16, [p0, #0] // Delay Slot 2 -; CHECK-NEXT: mov r16, r2 // Delay Slot 1 +; CHECK-NEXT: or r16, r2, r2; mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 %0 = tail call { <16 x i32>, i32 } @llvm.aie2.vsub.lt32(<16 x i32> %a, <16 x i32> %b, i32 %conv.i) diff --git a/llvm/test/CodeGen/AIE/aie2/vsub_acc.ll b/llvm/test/CodeGen/AIE/aie2/vsub_acc.ll index 591c049ab628..5ab0d9b46175 100644 --- a/llvm/test/CodeGen/AIE/aie2/vsub_acc.ll +++ b/llvm/test/CodeGen/AIE/aie2/vsub_acc.ll @@ -27,10 +27,8 @@ define dso_local noundef <16 x i64> @test_sub_conf(<16 x i64> noundef %acc1, <16 ; CHECK-LABEL: test_sub_conf: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova r4, #10; nopb ; nopx -; CHECK-NEXT: mova r5, #12 -; CHECK-NEXT: mova r6, #13 -; CHECK-NEXT: mova r7, #2 +; CHECK-NEXT: mova r4, #10; nopxm +; CHECK-NEXT: mova r5, #12; movx r7, #2; mov r6, #13 ; 
CHECK-NEXT: lshl r1, r1, r4 ; CHECK-NEXT: lshl r2, r2, r5 ; CHECK-NEXT: lshl r3, r3, r6