[AIE2] Add more memory/ptr combiners
Now we can selectively split memory operations to enhance selection combiner opportunities.
andcarminati committed Oct 22, 2024
1 parent 9fcbc8e commit 178dd24
Showing 14 changed files with 776 additions and 30 deletions.
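
In outline: an over-wide (512-bit) load or store is first split into two 256-bit parts, the AIE2 maximum, and the resulting pointer arithmetic is then folded back into offset-form memory operations. A hedged MIR sketch of the intended net effect, with illustrative register names and offsets (the real sequence is checked by the new addrspace test at the bottom of this diff):

  ; before: one over-wide load
  %v:_(<16 x s32>) = G_LOAD %p(p0) :: (load (<16 x s32>))

  ; after splitting plus offset folding: two max-width loads, re-assembled
  %c0:_(s20) = G_CONSTANT i20 0
  %c32:_(s20) = G_CONSTANT i20 32
  %lo:_(<8 x s32>) = G_AIE_OFFSET_LOAD %p(p0), %c0(s20) :: (load (<8 x s32>))
  %hi:_(<8 x s32>) = G_AIE_OFFSET_LOAD %p(p0), %c32(s20) :: (load (<8 x s32>) + 32)
  %v:_(<16 x s32>) = G_CONCAT_VECTORS %lo(<8 x s32>), %hi(<8 x s32>)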
39 changes: 39 additions & 0 deletions llvm/lib/Target/AIE/AIE2InstrInfo.cpp
@@ -1406,3 +1406,42 @@ AIE2InstrInfo::getVExtractOpInfo(const MachineInstr &MI) const {
return std::nullopt;
}
}

unsigned AIE2InstrInfo::getMaxLoadStoreSize() const { return 256; }

bool AIE2InstrInfo::canCombineWithLoadStore(const MachineInstr &MI) const {

if (!isa<GIntrinsic>(MI))
return false;

const unsigned ID = cast<GIntrinsic>(MI).getIntrinsicID();
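// SRS (accumulator-to-vector) and UPS (vector-to-accumulator) conversions
// can be combined with an adjacent load/store during instruction selection,
// so memory operations feeding or fed by them should stay intact.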

switch (ID) {
case Intrinsic::aie2_I256_v16_acc32_srs:
case Intrinsic::aie2_I256_v16_acc64_srs:
case Intrinsic::aie2_I256_v32_acc32_srs:
case Intrinsic::aie2_I256_v8_acc64_srs:
case Intrinsic::aie2_I512_v16_acc64_srs:
case Intrinsic::aie2_I512_v32_acc32_srs:

case Intrinsic::aie2_acc32_v16_I256_ups:
case Intrinsic::aie2_acc32_v32_I256_ups:
case Intrinsic::aie2_acc32_v32_I512_ups:
case Intrinsic::aie2_acc64_v16_I256_ups:
case Intrinsic::aie2_acc64_v16_I512_ups:
case Intrinsic::aie2_acc64_v8_I256_ups:
return true;
}
return false;
}

bool AIE2InstrInfo::isProfitableToSplitType(const LLT Ty) const {
const LLT V16S32 = LLT::fixed_vector(16, 32);
const LLT V32S16 = LLT::fixed_vector(32, 16);
const LLT V64S8 = LLT::fixed_vector(64, 8);
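// All three are 512-bit vectors, exactly twice getMaxLoadStoreSize(), so
// the load/store splitter always produces two 256-bit halves.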

if (Ty == V16S32 || Ty == V32S16 || Ty == V64S8)
return true;

return false;
}
6 changes: 6 additions & 0 deletions llvm/lib/Target/AIE/AIE2InstrInfo.h
@@ -179,6 +179,12 @@ class AIE2InstrInfo : public AIE2GenInstrInfo {
std::optional<const VExtractOpInfo>
getVExtractOpInfo(const MachineInstr &MI) const override;

unsigned getMaxLoadStoreSize() const override;

bool canCombineWithLoadStore(const MachineInstr &MI) const override;

bool isProfitableToSplitType(const LLT Ty) const override;

protected:
SmallVector<AIEPseudoExpandInfo, 4>
getSpillPseudoExpandInfo(const MachineInstr &MI) const override;
17 changes: 17 additions & 0 deletions llvm/lib/Target/AIE/AIEBaseInstrInfo.h
@@ -424,6 +424,23 @@ struct AIEBaseInstrInfo : public TargetInstrInfo {
llvm_unreachable("Target didn't implement getVExtractOpInfo!");
}

/// Return the maximum size for memory operations on this target.
virtual unsigned getMaxLoadStoreSize() const {
llvm_unreachable("Target didn't implement getMaxLoadStoreSize!");
}

/// Return true if this instruction can be combined with a memory operation.
virtual bool canCombineWithLoadStore(const MachineInstr &MI) const {
llvm_unreachable("Target didn't implement canCombineWithLoadStore!");
}

/// Return true if the type can be split to fit the target's restrictions.
/// For example, by splitting such types in advance, more combines can be
/// matched during selection.
virtual bool isProfitableToSplitType(const LLT Ty) const {
llvm_unreachable("Target didn't implement isProfitableToSplitType!");
}

protected:
/// Expand a spill pseudo-instruction into actual target instructions. This
/// will essentially split the register being handled into its sub-registers,
30 changes: 29 additions & 1 deletion llvm/lib/Target/AIE/AIECombine.td
@@ -109,8 +109,36 @@ def combine_add_vector_elt_undef : GICombineRule <
(apply [{ applyAddVecEltUndef(*${root}, MRI, B); }] )
>;

def combine_load_store_split_matchdata: GIDefMatchData<"unsigned">;
def combine_load_store_split : GICombineRule<
(defs root:$root, combine_load_store_split_matchdata:$matchinfo),
(match (wip_match_opcode G_LOAD, G_STORE): $root,
[{ return matchLoadStoreSplit(cast<GLoadStore>(*${root}), MRI, (const AIEBaseInstrInfo &)B.getTII(), ${matchinfo}); }]),
(apply [{ applyLoadStoreSplit(cast<GLoadStore>(*${root}), MRI, B, ${matchinfo}); }])
>;

def combine_offset_load_store_ptradd_matchdata: GIDefMatchData<"std::pair<Register, int64_t>">;
def combine_offset_load_store_ptradd : GICombineRule<
(defs root:$root, combine_offset_load_store_ptradd_matchdata:$matchinfo),
(match (wip_match_opcode G_AIE_OFFSET_LOAD, G_AIE_OFFSET_STORE): $root,
[{ return matchOffsetLoadStorePtrAdd(*${root}, MRI, (const AIEBaseInstrInfo &)B.getTII(), ${matchinfo}); }]),
(apply [{ applyOffsetLoadStorePtrAdd(*${root}, MRI, B, ${matchinfo}); }])
>;

def combine_offset_load_store_share_ptradd_matchdata: GIDefMatchData<"Register">;
def combine_offset_load_store_share_ptradd : GICombineRule<
(defs root:$root, combine_offset_load_store_share_ptradd_matchdata:$matchinfo),
(match (wip_match_opcode G_AIE_OFFSET_LOAD, G_AIE_OFFSET_STORE): $root,
[{ return matchOffsetLoadStoreSharePtrAdd(*${root}, MRI, Helper, (const AIEBaseInstrInfo &)B.getTII(), ${matchinfo}); }]),
(apply [{ applyOffsetLoadStoreSharePtrAdd(*${root}, MRI, B, ${matchinfo}); }])
>;
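
// Taken together: combine_load_store_split narrows over-wide memory ops,
// combine_offset_load_store_ptradd folds a constant G_PTR_ADD into the
// offset operand, and combine_offset_load_store_share_ptradd reuses a
// dominating G_PTR_ADD (with a zero offset) instead of repeating the add.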

def AIE2PostLegalizerCustomCombiner
: GICombiner<"AIE2PostLegalizerCustomCombinerImpl", [ combine_load_store_increment,
: GICombiner<"AIE2PostLegalizerCustomCombinerImpl", [ combine_load_store_split,
ptr_add_immed_chain,
combine_load_store_increment,
combine_offset_load_store_ptradd,
combine_offset_load_store_share_ptradd,
combine_add_vector_elt_undef,
combine_extract_concat,
combine_unmerge_concat,
192 changes: 192 additions & 0 deletions llvm/lib/Target/AIE/AIECombinerHelper.cpp
@@ -1358,3 +1358,195 @@ void llvm::applyUpdToConcat(MachineInstr &MI, MachineRegisterInfo &MRI,

MI.eraseFromParent();
}

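/// Hedged example of a rejected split (operands abridged): the wide load
/// below is left intact because its only user is an UPS conversion that
/// selection can combine with the load directly:
///   %v:_(<16 x s32>) = G_LOAD %p(p0) :: (load (<16 x s32>))
///   %acc:_ = G_INTRINSIC intrinsic(@llvm.aie2.acc64.v16.I512.ups), %v, ...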
bool llvm::matchLoadStoreSplit(GLoadStore &MI, MachineRegisterInfo &MRI,
const AIEBaseInstrInfo &TII,
unsigned &MaxMemSize) {

const Register ValReg = MI.getReg(0);
const LLT ValTy = MRI.getType(ValReg);
const bool IsLoad = isa<GLoad>(MI);
MaxMemSize = TII.getMaxLoadStoreSize();

if (!TII.isProfitableToSplitType(ValTy))
return false;

// Avoid splitting operations that can be combined as-is.
if (IsLoad) {
for (MachineInstr &ConvInstr : MRI.use_instructions(ValReg)) {
if (TII.canCombineWithLoadStore(ConvInstr))
return false;
}
} else {
MachineInstr &ConvInstr = *getDefIgnoringCopiesAndBitcasts(ValReg, MRI);
if (TII.canCombineWithLoadStore(ConvInstr))
return false;
}

return true;
}

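/// Hedged sketch of the rewrite for a <16 x s32> load with a 256-bit limit
/// (names illustrative; parts are built from the highest byte offset down,
/// then reversed so the concat reads low-to-high):
///   %off:_(s20) = G_CONSTANT i20 32
///   %p1:_(p0) = G_PTR_ADD %p, %off(s20)
///   %hi:_(<8 x s32>) = G_LOAD %p1(p0) :: (load (<8 x s32>) + 32)
///   %lo:_(<8 x s32>) = G_LOAD %p(p0) :: (load (<8 x s32>))
///   %v:_(<16 x s32>) = G_CONCAT_VECTORS %lo(<8 x s32>), %hi(<8 x s32>)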
void llvm::applyLoadStoreSplit(GLoadStore &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, const unsigned MaxMemSize) {

assert(MaxMemSize && "MaxMemSize should be specified!");
B.setInstrAndDebugLoc(MI);
MachineFunction &MF = B.getMF();
const bool IsLoad = isa<GLoad>(MI);
const Register ValReg = MI.getReg(0);
const Register AddrReg = MI.getPointerReg();
const LLT ValTy = MRI.getType(ValReg);
const LLT PtrTy = MRI.getType(AddrReg);
const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
const unsigned NumParts = ValTy.getSizeInBits() / MaxMemSize;
const LLT NarrowTy = ValTy.divide(NumParts);
const MachineMemOperand MMO = MI.getMMO();

SmallVector<Register, 8> NarrowRegs;
if (!IsLoad)
extractParts(ValReg, NarrowTy, NumParts, NarrowRegs, B, MRI);

for (int I = NumParts - 1; I >= 0; I--) {
const unsigned ByteOffset = I * NarrowTy.getSizeInBytes();
Register NewAddrReg;
B.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);
MachineMemOperand *NewMMO =
MF.getMachineMemOperand(&MMO, ByteOffset, NarrowTy);

if (IsLoad) {
Register Dst = MRI.createGenericVirtualRegister(NarrowTy);
NarrowRegs.push_back(Dst);
B.buildLoad(Dst, NewAddrReg, *NewMMO);
} else {
B.buildStore(NarrowRegs[I], NewAddrReg, *NewMMO);
}
}

if (IsLoad) {
std::reverse(NarrowRegs.begin(), NarrowRegs.end());
B.buildConcatVectors(ValReg, NarrowRegs);
}

MI.eraseFromParent();
}

/// Match something like this:
/// %293:_(s20) = G_CONSTANT i20 32
/// %67:_(s20) = G_CONSTANT i20 64
/// %68:_(p0) = nuw G_PTR_ADD %61, %67(s20)
/// %295:_(<16 x s16>) = G_AIE_OFFSET_LOAD %68(p0), %293(s20)
///
/// To convert to:
/// %298:_(s20) = G_CONSTANT i20 96
/// %295:_(<16 x s16>) = G_AIE_OFFSET_LOAD %61(p0), %298(s20)
bool llvm::matchOffsetLoadStorePtrAdd(MachineInstr &MI,
MachineRegisterInfo &MRI,
const AIEBaseInstrInfo &TII,
std::pair<Register, int64_t> &RegOffset) {

const Register AddrReg = MI.getOperand(1).getReg();

const auto CstOffsetLoadStore =
getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);

if (!CstOffsetLoadStore)
return false;

MachineInstr *DefAddrRegInstr = MRI.getVRegDef(AddrReg);

if (DefAddrRegInstr->getOpcode() != TargetOpcode::G_PTR_ADD)
return false;

const auto CstDefAddrRegInstr = getIConstantVRegValWithLookThrough(
DefAddrRegInstr->getOperand(2).getReg(), MRI);

if (!CstDefAddrRegInstr)
return false;

RegOffset.first = DefAddrRegInstr->getOperand(1).getReg();
RegOffset.second = CstDefAddrRegInstr->Value.getSExtValue() +
CstOffsetLoadStore->Value.getSExtValue();

return true;
}

void llvm::applyOffsetLoadStorePtrAdd(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
const std::pair<Register, int64_t> &RegOffset) {
B.setInstrAndDebugLoc(MI);

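// AIE2 pointer offsets are 20 bits wide, hence the s20 constant type.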
Register NewOffsetReg =
B.buildConstant(LLT::scalar(20), RegOffset.second).getReg(0);

MI.getOperand(1).setReg(RegOffset.first);
MI.getOperand(2).setReg(NewOffsetReg);
}

/// Match something like this:
/// %0:_(s20) = COPY $m0
/// %1:_(p0) = COPY $p0
/// %2:_(<16 x s32>) = COPY $x0
/// %6:_(p0) = G_PTR_ADD %1, %0(s20)
/// %18:_(s20) = G_CONSTANT i20 32
/// G_AIE_OFFSET_STORE %15(<8 x s32>), %6(p0), %18(s20)
/// G_AIE_OFFSET_STORE %14(<8 x s32>), %1(p0), %0(s20)
///
/// To convert to (pointer reuse/CSE):
/// %0:_(s20) = COPY $m0
/// %1:_(p0) = COPY $p0
/// %2:_(<16 x s32>) = COPY $x0
/// %6:_(p0) = G_PTR_ADD %1, %0(s20)
/// %18:_(s20) = G_CONSTANT i20 32
/// %19:_(s20) = G_CONSTANT i20 0
/// G_AIE_OFFSET_STORE %15(<8 x s32>), %6(p0), %18(s20)
/// G_AIE_OFFSET_STORE %14(<8 x s32>), %6(p0), %19(s20)
bool llvm::matchOffsetLoadStoreSharePtrAdd(MachineInstr &MI,
MachineRegisterInfo &MRI,
CombinerHelper &Helper,
const AIEBaseInstrInfo &TII,
Register &PtrAddReg) {
const Register PtrReg = MI.getOperand(1).getReg();
const Register OffsetReg = MI.getOperand(2).getReg();

const auto OffsetCst = getIConstantVRegValWithLookThrough(OffsetReg, MRI);

// If the offset is already a constant, leave it alone: it is better to
// keep it folded. Otherwise the previous combiner would just fold the
// G_PTR_ADD back into the offset.
if (OffsetCst)
return false;

for (auto &Use : MRI.use_nodbg_instructions(PtrReg)) {
if (Use.getOpcode() != TargetOpcode::G_PTR_ADD)
continue;
if (Use.getOperand(2).getReg() != OffsetReg)
continue;
if (Use.getParent() != MI.getParent())
continue;
if (!Helper.dominates(Use, MI))
continue;

Register PaddDestReg = Use.getOperand(0).getReg();

// Don't reuse a dead G_PTR_ADD: only take it if at least one other
// instruction is using its result.
if (hasNItemsOrMore(MRI.use_instr_nodbg_begin(PaddDestReg),
MRI.use_instr_nodbg_end(), 1)) {
PtrAddReg = PaddDestReg;
return true;
}
}

return false;
}

void llvm::applyOffsetLoadStoreSharePtrAdd(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B,
Register &PtrAddReg) {

Register NewOffsetReg = B.buildConstant(LLT::scalar(20), 0).getReg(0);

MI.getOperand(1).setReg(PtrAddReg);
MI.getOperand(2).setReg(NewOffsetReg);
}
21 changes: 21 additions & 0 deletions llvm/lib/Target/AIE/AIECombinerHelper.h
@@ -162,6 +162,27 @@ void applyUpdToConcat(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B,
std::map<unsigned, Register> &IndexRegMap);

bool matchLoadStoreSplit(GLoadStore &MI, MachineRegisterInfo &MRI,
const AIEBaseInstrInfo &TII, unsigned &MaxMemSize);
void applyLoadStoreSplit(GLoadStore &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, const unsigned MaxMemSize);

bool matchOffsetLoadStorePtrAdd(MachineInstr &MI, MachineRegisterInfo &MRI,
const AIEBaseInstrInfo &TII,
std::pair<Register, int64_t> &RegOffset);

void applyOffsetLoadStorePtrAdd(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B,
const std::pair<Register, int64_t> &RegOffset);

bool matchOffsetLoadStoreSharePtrAdd(MachineInstr &MI, MachineRegisterInfo &MRI,
CombinerHelper &Helper,
const AIEBaseInstrInfo &TII,
Register &PtrAddReg);

void applyOffsetLoadStoreSharePtrAdd(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, Register &PtrAddReg);

} // namespace llvm

#endif
29 changes: 29 additions & 0 deletions llvm/test/CodeGen/AIE/GlobalISel/addrspace-before-selection.ll
@@ -0,0 +1,29 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
; This file is licensed under the Apache License v2.0 with LLVM Exceptions.
; See https://llvm.org/LICENSE.txt for license information.
; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
;
; (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
; RUN: llc -mtriple=aie2 -stop-before=instruction-select %s -o - 2>&1 | FileCheck %s

; Test that the addrspace is correctly propagated through transformations
; such as memory op splitting.
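; The getelementptr below selects element 2 of [16 x <16 x i32>] (64 bytes
; per element), i.e. byte offset 128; the split halves land at offsets 128
; and 160, as the CHECK lines below verify.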

define dso_local noundef <16 x i32> @addrspace_propagation(ptr addrspace(6) nocapture readonly %ptr) local_unnamed_addr #0 {
; CHECK-LABEL: name: addrspace_propagation
; CHECK: bb.1.entry:
; CHECK-NEXT: liveins: $p0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:ptrregbank(p0) = COPY $p0
; CHECK-NEXT: [[C:%[0-9]+]]:modregbank(s20) = G_CONSTANT i20 128
; CHECK-NEXT: [[C1:%[0-9]+]]:modregbank(s20) = G_CONSTANT i20 160
; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:vregbank(<8 x s32>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C1]](s20) :: (load (<8 x s32>) from %ir.arrayidx.1 + 32, addrspace 6)
; CHECK-NEXT: [[AIE_OFFSET_LOAD1:%[0-9]+]]:vregbank(<8 x s32>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (<8 x s32>) from %ir.arrayidx.1, addrspace 6)
; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vregbank(<16 x s32>) = G_CONCAT_VECTORS [[AIE_OFFSET_LOAD1]](<8 x s32>), [[AIE_OFFSET_LOAD]](<8 x s32>)
; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>)
; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0
entry:
%arrayidx.1 = getelementptr inbounds [16 x <16 x i32>], ptr addrspace(6) %ptr, i32 0, i32 2
%0 = load <16 x i32>, ptr addrspace(6) %arrayidx.1, align 32
ret <16 x i32> %0
}