[AIE2] Add more memory/ptr combiners
Now we can selectively split memory operations to enhance selection combiner opportunities.
andcarminati committed Oct 22, 2024
1 parent 9fcbc8e commit 178dd24
Showing 14 changed files with 776 additions and 30 deletions.
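
In outline: an over-wide (512-bit) load or store is first split into two 256-bit parts, the AIE2 maximum, and the resulting pointer arithmetic is then folded back into offset-form memory operations. A hedged MIR sketch of the intended net effect, with illustrative register names and offsets (the real sequence is checked by the new addrspace test at the bottom of this diff):

  ; before: one over-wide load
  %v:_(<16 x s32>) = G_LOAD %p(p0) :: (load (<16 x s32>))

  ; after splitting plus offset folding: two max-width loads, re-assembled
  %c0:_(s20) = G_CONSTANT i20 0
  %c32:_(s20) = G_CONSTANT i20 32
  %lo:_(<8 x s32>) = G_AIE_OFFSET_LOAD %p(p0), %c0(s20) :: (load (<8 x s32>))
  %hi:_(<8 x s32>) = G_AIE_OFFSET_LOAD %p(p0), %c32(s20) :: (load (<8 x s32>) + 32)
  %v:_(<16 x s32>) = G_CONCAT_VECTORS %lo(<8 x s32>), %hi(<8 x s32>)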
39 changes: 39 additions & 0 deletions llvm/lib/Target/AIE/AIE2InstrInfo.cpp
@@ -1406,3 +1406,42 @@ AIE2InstrInfo::getVExtractOpInfo(const MachineInstr &MI) const {
return std::nullopt;
}
}

unsigned AIE2InstrInfo::getMaxLoadStoreSize() const { return 256; }

bool AIE2InstrInfo::canCombineWithLoadStore(const MachineInstr &MI) const {

if (!isa<GIntrinsic>(MI))
return false;

const unsigned ID = cast<GIntrinsic>(MI).getIntrinsicID();
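// SRS (accumulator-to-vector) and UPS (vector-to-accumulator) conversions
// can be combined with an adjacent load/store during instruction selection,
// so memory operations feeding or fed by them should stay intact.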

switch (ID) {
case Intrinsic::aie2_I256_v16_acc32_srs:
case Intrinsic::aie2_I256_v16_acc64_srs:
case Intrinsic::aie2_I256_v32_acc32_srs:
case Intrinsic::aie2_I256_v8_acc64_srs:
case Intrinsic::aie2_I512_v16_acc64_srs:
case Intrinsic::aie2_I512_v32_acc32_srs:

case Intrinsic::aie2_acc32_v16_I256_ups:
case Intrinsic::aie2_acc32_v32_I256_ups:
case Intrinsic::aie2_acc32_v32_I512_ups:
case Intrinsic::aie2_acc64_v16_I256_ups:
case Intrinsic::aie2_acc64_v16_I512_ups:
case Intrinsic::aie2_acc64_v8_I256_ups:
return true;
}
return false;
}

bool AIE2InstrInfo::isProfitableToSplitType(const LLT Ty) const {
const LLT V16S32 = LLT::fixed_vector(16, 32);
const LLT V32S16 = LLT::fixed_vector(32, 16);
const LLT V64S8 = LLT::fixed_vector(64, 8);
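// All three are 512-bit vectors, exactly twice getMaxLoadStoreSize(), so
// the load/store splitter always produces two 256-bit halves.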

if (Ty == V16S32 || Ty == V32S16 || Ty == V64S8)
return true;

return false;
}
6 changes: 6 additions & 0 deletions llvm/lib/Target/AIE/AIE2InstrInfo.h
@@ -179,6 +179,12 @@ class AIE2InstrInfo : public AIE2GenInstrInfo {
std::optional<const VExtractOpInfo>
getVExtractOpInfo(const MachineInstr &MI) const override;

unsigned getMaxLoadStoreSize() const override;

bool canCombineWithLoadStore(const MachineInstr &MI) const override;

bool isProfitableToSplitType(const LLT Ty) const override;

protected:
SmallVector<AIEPseudoExpandInfo, 4>
getSpillPseudoExpandInfo(const MachineInstr &MI) const override;
17 changes: 17 additions & 0 deletions llvm/lib/Target/AIE/AIEBaseInstrInfo.h
@@ -424,6 +424,23 @@ struct AIEBaseInstrInfo : public TargetInstrInfo {
llvm_unreachable("Target didn't implement getVExtractOpInfo!");
}

/// Return the maximum size for memory operations on this target.
virtual unsigned getMaxLoadStoreSize() const {
llvm_unreachable("Target didn't implement getMaxLoadStoreSize!");
}

/// Return true if this instruction can be combined with a memory operation.
virtual bool canCombineWithLoadStore(const MachineInstr &MI) const {
llvm_unreachable("Target didn't implement canCombineWithLoadStore!");
}

/// Return true if the type can be split to fit the target's restrictions.
/// For example, by splitting such types in advance, more combines can be
/// matched during selection.
virtual bool isProfitableToSplitType(const LLT Ty) const {
llvm_unreachable("Target didn't implement isProfitableToSplitType!");
}

protected:
/// Expand a spill pseudo-instruction into actual target instructions. This
/// will essentially split the register being handled into its sub-registers,
30 changes: 29 additions & 1 deletion llvm/lib/Target/AIE/AIECombine.td
@@ -109,8 +109,36 @@ def combine_add_vector_elt_undef : GICombineRule <
(apply [{ applyAddVecEltUndef(*${root}, MRI, B); }] )
>;

def combine_load_store_split_matchdata: GIDefMatchData<"unsigned">;
def combine_load_store_split : GICombineRule<
(defs root:$root, combine_load_store_split_matchdata:$matchinfo),
(match (wip_match_opcode G_LOAD, G_STORE): $root,
[{ return matchLoadStoreSplit(cast<GLoadStore>(*${root}), MRI, (const AIEBaseInstrInfo &)B.getTII(), ${matchinfo}); }]),
(apply [{ applyLoadStoreSplit(cast<GLoadStore>(*${root}), MRI, B, ${matchinfo}); }])
>;

def combine_offset_load_store_ptradd_matchdata: GIDefMatchData<"std::pair<Register, int64_t>">;
def combine_offset_load_store_ptradd : GICombineRule<
(defs root:$root, combine_offset_load_store_ptradd_matchdata:$matchinfo),
(match (wip_match_opcode G_AIE_OFFSET_LOAD, G_AIE_OFFSET_STORE): $root,
[{ return matchOffsetLoadStorePtrAdd(*${root}, MRI, (const AIEBaseInstrInfo &)B.getTII(), ${matchinfo}); }]),
(apply [{ applyOffsetLoadStorePtrAdd(*${root}, MRI, B, ${matchinfo}); }])
>;

def combine_offset_load_store_share_ptradd_matchdata: GIDefMatchData<"Register">;
def combine_offset_load_store_share_ptradd : GICombineRule<
(defs root:$root, combine_offset_load_store_share_ptradd_matchdata:$matchinfo),
(match (wip_match_opcode G_AIE_OFFSET_LOAD, G_AIE_OFFSET_STORE): $root,
[{ return matchOffsetLoadStoreSharePtrAdd(*${root}, MRI, Helper, (const AIEBaseInstrInfo &)B.getTII(), ${matchinfo}); }]),
(apply [{ applyOffsetLoadStoreSharePtrAdd(*${root}, MRI, B, ${matchinfo}); }])
>;
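
// Taken together: combine_load_store_split narrows over-wide memory ops,
// combine_offset_load_store_ptradd folds a constant G_PTR_ADD into the
// offset operand, and combine_offset_load_store_share_ptradd reuses a
// dominating G_PTR_ADD (with a zero offset) instead of repeating the add.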

def AIE2PostLegalizerCustomCombiner
: GICombiner<"AIE2PostLegalizerCustomCombinerImpl", [ combine_load_store_increment,
: GICombiner<"AIE2PostLegalizerCustomCombinerImpl", [ combine_load_store_split,
ptr_add_immed_chain,
combine_load_store_increment,
combine_offset_load_store_ptradd,
combine_offset_load_store_share_ptradd,
combine_add_vector_elt_undef,
combine_extract_concat,
combine_unmerge_concat,
192 changes: 192 additions & 0 deletions llvm/lib/Target/AIE/AIECombinerHelper.cpp
@@ -1358,3 +1358,195 @@ void llvm::applyUpdToConcat(MachineInstr &MI, MachineRegisterInfo &MRI,

MI.eraseFromParent();
}

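/// Hedged example of a rejected split (operands abridged): the wide load
/// below is left intact because its only user is an UPS conversion that
/// selection can combine with the load directly:
///   %v:_(<16 x s32>) = G_LOAD %p(p0) :: (load (<16 x s32>))
///   %acc:_ = G_INTRINSIC intrinsic(@llvm.aie2.acc64.v16.I512.ups), %v, ...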
bool llvm::matchLoadStoreSplit(GLoadStore &MI, MachineRegisterInfo &MRI,
const AIEBaseInstrInfo &TII,
unsigned &MaxMemSize) {

const Register ValReg = MI.getReg(0);
const LLT ValTy = MRI.getType(ValReg);
const bool IsLoad = isa<GLoad>(MI);
MaxMemSize = TII.getMaxLoadStoreSize();

if (!TII.isProfitableToSplitType(ValTy))
return false;

// Avoid splitting operations that can be combined as-is.
if (IsLoad) {
for (MachineInstr &ConvInstr : MRI.use_instructions(ValReg)) {
if (TII.canCombineWithLoadStore(ConvInstr))
return false;
}
} else {
MachineInstr &ConvInstr = *getDefIgnoringCopiesAndBitcasts(ValReg, MRI);
if (TII.canCombineWithLoadStore(ConvInstr))
return false;
}

return true;
}

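/// Hedged sketch of the rewrite for a <16 x s32> load with a 256-bit limit
/// (names illustrative; parts are built from the highest byte offset down,
/// then reversed so the concat reads low-to-high):
///   %off:_(s20) = G_CONSTANT i20 32
///   %p1:_(p0) = G_PTR_ADD %p, %off(s20)
///   %hi:_(<8 x s32>) = G_LOAD %p1(p0) :: (load (<8 x s32>) + 32)
///   %lo:_(<8 x s32>) = G_LOAD %p(p0) :: (load (<8 x s32>))
///   %v:_(<16 x s32>) = G_CONCAT_VECTORS %lo(<8 x s32>), %hi(<8 x s32>)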
void llvm::applyLoadStoreSplit(GLoadStore &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, const unsigned MaxMemSize) {

assert(MaxMemSize && "MaxMemSize should be specified!");
B.setInstrAndDebugLoc(MI);
MachineFunction &MF = B.getMF();
const bool IsLoad = isa<GLoad>(MI);
const Register ValReg = MI.getReg(0);
const Register AddrReg = MI.getPointerReg();
const LLT ValTy = MRI.getType(ValReg);
const LLT PtrTy = MRI.getType(AddrReg);
const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
const unsigned NumParts = ValTy.getSizeInBits() / MaxMemSize;
const LLT NarrowTy = ValTy.divide(NumParts);
const MachineMemOperand MMO = MI.getMMO();

SmallVector<Register, 8> NarrowRegs;
if (!IsLoad)
extractParts(ValReg, NarrowTy, NumParts, NarrowRegs, B, MRI);

for (int I = NumParts - 1; I >= 0; I--) {
const unsigned ByteOffset = I * NarrowTy.getSizeInBytes();
Register NewAddrReg;
B.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);
MachineMemOperand *NewMMO =
MF.getMachineMemOperand(&MMO, ByteOffset, NarrowTy);

if (IsLoad) {
Register Dst = MRI.createGenericVirtualRegister(NarrowTy);
NarrowRegs.push_back(Dst);
B.buildLoad(Dst, NewAddrReg, *NewMMO);
} else {
B.buildStore(NarrowRegs[I], NewAddrReg, *NewMMO);
}
}

if (IsLoad) {
std::reverse(NarrowRegs.begin(), NarrowRegs.end());
B.buildConcatVectors(ValReg, NarrowRegs);
}

MI.eraseFromParent();
}

/// Match something like this:
/// %293:_(s20) = G_CONSTANT i20 32
/// %67:_(s20) = G_CONSTANT i20 64
/// %68:_(p0) = nuw G_PTR_ADD %61, %67(s20)
/// %295:_(<16 x s16>) = G_AIE_OFFSET_LOAD %68(p0), %293(s20)
///
/// To convert to:
/// %298:_(s20) = G_CONSTANT i20 96
/// %295:_(<16 x s16>) = G_AIE_OFFSET_LOAD %61(p0), %298(s20)
bool llvm::matchOffsetLoadStorePtrAdd(MachineInstr &MI,
MachineRegisterInfo &MRI,
const AIEBaseInstrInfo &TII,
std::pair<Register, int64_t> &RegOffset) {

const Register AddrReg = MI.getOperand(1).getReg();

const auto CstOffsetLoadStore =
getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);

if (!CstOffsetLoadStore)
return false;

MachineInstr *DefAddrRegInstr = MRI.getVRegDef(AddrReg);

if (DefAddrRegInstr->getOpcode() != TargetOpcode::G_PTR_ADD)
return false;

const auto CstDefAddrRegInstr = getIConstantVRegValWithLookThrough(
DefAddrRegInstr->getOperand(2).getReg(), MRI);

if (!CstDefAddrRegInstr)
return false;

RegOffset.first = DefAddrRegInstr->getOperand(1).getReg();
RegOffset.second = CstDefAddrRegInstr->Value.getSExtValue() +
CstOffsetLoadStore->Value.getSExtValue();

return true;
}

void llvm::applyOffsetLoadStorePtrAdd(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
const std::pair<Register, int64_t> &RegOffset) {
B.setInstrAndDebugLoc(MI);

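// AIE2 pointer offsets are 20 bits wide, hence the s20 constant type.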
Register NewOffsetReg =
B.buildConstant(LLT::scalar(20), RegOffset.second).getReg(0);

MI.getOperand(1).setReg(RegOffset.first);
MI.getOperand(2).setReg(NewOffsetReg);
}

/// Match something like this:
/// %0:_(s20) = COPY $m0
/// %1:_(p0) = COPY $p0
/// %2:_(<16 x s32>) = COPY $x0
/// %6:_(p0) = G_PTR_ADD %1, %0(s20)
/// %18:_(s20) = G_CONSTANT i20 32
/// G_AIE_OFFSET_STORE %15(<8 x s32>), %6(p0), %18(s20)
/// G_AIE_OFFSET_STORE %14(<8 x s32>), %1(p0), %0(s20)
///
/// To convert to (pointer reuse/CSE):
/// %0:_(s20) = COPY $m0
/// %1:_(p0) = COPY $p0
/// %2:_(<16 x s32>) = COPY $x0
/// %6:_(p0) = G_PTR_ADD %1, %0(s20)
/// %18:_(s20) = G_CONSTANT i20 32
/// %19:_(s20) = G_CONSTANT i20 0
/// G_AIE_OFFSET_STORE %15(<8 x s32>), %6(p0), %18(s20)
/// G_AIE_OFFSET_STORE %14(<8 x s32>), %6(p0), %19(s20)
bool llvm::matchOffsetLoadStoreSharePtrAdd(MachineInstr &MI,
MachineRegisterInfo &MRI,
CombinerHelper &Helper,
const AIEBaseInstrInfo &TII,
Register &PtrAddReg) {
const Register PtrReg = MI.getOperand(1).getReg();
const Register OffsetReg = MI.getOperand(2).getReg();

const auto OffsetCst = getIConstantVRegValWithLookThrough(OffsetReg, MRI);

// If the offset is already a constant, leave it alone: it is better to
// keep it folded. Otherwise the previous combiner would just fold the
// G_PTR_ADD back into the offset.
if (OffsetCst)
return false;

for (auto &Use : MRI.use_nodbg_instructions(PtrReg)) {
if (Use.getOpcode() != TargetOpcode::G_PTR_ADD)
continue;
if (Use.getOperand(2).getReg() != OffsetReg)
continue;
if (Use.getParent() != MI.getParent())
continue;
if (!Helper.dominates(Use, MI))
continue;

Register PaddDestReg = Use.getOperand(0).getReg();

// Don't reuse a dead G_PTR_ADD: only take it if at least one other
// instruction is using its result.
if (hasNItemsOrMore(MRI.use_instr_nodbg_begin(PaddDestReg),
MRI.use_instr_nodbg_end(), 1)) {
PtrAddReg = PaddDestReg;
return true;
}
}

return false;
}

void llvm::applyOffsetLoadStoreSharePtrAdd(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B,
Register &PtrAddReg) {

Register NewOffsetReg = B.buildConstant(LLT::scalar(20), 0).getReg(0);

MI.getOperand(1).setReg(PtrAddReg);
MI.getOperand(2).setReg(NewOffsetReg);
}
21 changes: 21 additions & 0 deletions llvm/lib/Target/AIE/AIECombinerHelper.h
@@ -162,6 +162,27 @@ void applyUpdToConcat(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B,
std::map<unsigned, Register> &IndexRegMap);

bool matchLoadStoreSplit(GLoadStore &MI, MachineRegisterInfo &MRI,
const AIEBaseInstrInfo &TII, unsigned &MaxMemSize);
void applyLoadStoreSplit(GLoadStore &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, const unsigned MaxMemSize);

bool matchOffsetLoadStorePtrAdd(MachineInstr &MI, MachineRegisterInfo &MRI,
const AIEBaseInstrInfo &TII,
std::pair<Register, int64_t> &RegOffset);

void applyOffsetLoadStorePtrAdd(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B,
const std::pair<Register, int64_t> &RegOffset);

bool matchOffsetLoadStoreSharePtrAdd(MachineInstr &MI, MachineRegisterInfo &MRI,
CombinerHelper &Helper,
const AIEBaseInstrInfo &TII,
Register &PtrAddReg);

void applyOffsetLoadStoreSharePtrAdd(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, Register &PtrAddReg);

} // namespace llvm

#endif
29 changes: 29 additions & 0 deletions llvm/test/CodeGen/AIE/GlobalISel/addrspace-before-selection.ll
@@ -0,0 +1,29 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
; This file is licensed under the Apache License v2.0 with LLVM Exceptions.
; See https://llvm.org/LICENSE.txt for license information.
; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
;
; (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
; RUN: llc -mtriple=aie2 -stop-before=instruction-select %s -o - 2>&1 | FileCheck %s

; Test that the addrspace is correctly propagated through transformations
; such as memory op splitting.
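; The getelementptr below selects element 2 of [16 x <16 x i32>] (64 bytes
; per element), i.e. byte offset 128; the split halves land at offsets 128
; and 160, as the CHECK lines below verify.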

define dso_local noundef <16 x i32> @addrspace_propagation(ptr addrspace(6) nocapture readonly %ptr) local_unnamed_addr #0 {
; CHECK-LABEL: name: addrspace_propagation
; CHECK: bb.1.entry:
; CHECK-NEXT: liveins: $p0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:ptrregbank(p0) = COPY $p0
; CHECK-NEXT: [[C:%[0-9]+]]:modregbank(s20) = G_CONSTANT i20 128
; CHECK-NEXT: [[C1:%[0-9]+]]:modregbank(s20) = G_CONSTANT i20 160
; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:vregbank(<8 x s32>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C1]](s20) :: (load (<8 x s32>) from %ir.arrayidx.1 + 32, addrspace 6)
; CHECK-NEXT: [[AIE_OFFSET_LOAD1:%[0-9]+]]:vregbank(<8 x s32>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (<8 x s32>) from %ir.arrayidx.1, addrspace 6)
; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vregbank(<16 x s32>) = G_CONCAT_VECTORS [[AIE_OFFSET_LOAD1]](<8 x s32>), [[AIE_OFFSET_LOAD]](<8 x s32>)
; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>)
; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0
entry:
%arrayidx.1 = getelementptr inbounds [16 x <16 x i32>], ptr addrspace(6) %ptr, i32 0, i32 2
%0 = load <16 x i32>, ptr addrspace(6) %arrayidx.1, align 32
ret <16 x i32> %0
}