From 8bf356fb84615cbe0622eaae1892840de7c8f3e6 Mon Sep 17 00:00:00 2001
From: Krishnam Tibrewala <krishnam.tibrewala@amd.com>
Date: Thu, 3 Oct 2024 05:23:47 -0700
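Subject: [PATCH] [AIEX] Re-assign multi-slot instructions during iterative
 scheduling

Keep the slot chosen for a multi-slot instruction in AIEAlternateDescriptors
instead of writing it into the MachineInstr when a region is left. The
selected descriptors are now materialized in commitBlockSchedule(), so an
instruction keeps its multi-slot opcode until the block schedule is
committed.

- Scoreboard and hazard queries in AIEHazardRecognizer, the inter-block
  scheduler and the post-pipeliner use the selected descriptor rather than
  MI->getDesc().
- PipelineExtractor records the selected descriptor for the instructions it
  emits, including the clones created for the prologue and epilogue.
- -aie-reassign-multislot-instr=false (default: true) materializes the
  selected descriptors in leaveRegion() instead.
- Add loop-multiSlot.mir, run with the option both on and off.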
---
 llvm/lib/Target/AIE/AIEAlternateDescriptors.h |   7 +
 llvm/lib/Target/AIE/AIEHazardRecognizer.cpp   |   2 +-
 .../Target/AIE/AIEInterBlockScheduling.cpp    |  26 +-
 llvm/lib/Target/AIE/AIEMachineScheduler.cpp   |  42 ++-
 llvm/lib/Target/AIE/AIEPostPipeliner.cpp      |   9 +-
 .../schedule/loopaware/loop-multiSlot.mir     | 301 ++++++++++++++++++
 6 files changed, 350 insertions(+), 37 deletions(-)
 create mode 100644 llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-multiSlot.mir

diff --git a/llvm/lib/Target/AIE/AIEAlternateDescriptors.h b/llvm/lib/Target/AIE/AIEAlternateDescriptors.h
index 01673a348efd..24636b21f085 100644
--- a/llvm/lib/Target/AIE/AIEAlternateDescriptors.h
+++ b/llvm/lib/Target/AIE/AIEAlternateDescriptors.h
@@ -31,6 +31,9 @@ class AIEAlternateDescriptors {
   AIEAlternateDescriptors() = default;
   ~AIEAlternateDescriptors() = default;
 
+  MIAltDescsMap::const_iterator begin() const { return AlternateDescs.begin(); }
+  MIAltDescsMap::const_iterator end() const { return AlternateDescs.end(); }
+
   // Construct an alternate descriptor with the given alternate descriptors.
   AIEAlternateDescriptors(const MIAltDescsMap &AltDescs)
       : AlternateDescs(AltDescs) {}
@@ -43,6 +46,10 @@ class AIEAlternateDescriptors {
     AlternateDescs[MI] = &TII->get(AltInstOpcode);
   }
 
+  void setAlternateDescriptor(MachineInstr *MI, const MCInstrDesc *AltDesc) {
+    AlternateDescs[MI] = AltDesc; // Replaces any earlier selection.
+  }
+
   // Return the alternate descriptor for the given multi-opcode instruction.
   std::optional<const MCInstrDesc *>
   getSelectedDescriptor(MachineInstr *MI) const {
diff --git a/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp b/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp
index 5eb8dfa3a943..fdb0b039bd41 100644
--- a/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp
+++ b/llvm/lib/Target/AIE/AIEHazardRecognizer.cpp
@@ -447,7 +447,7 @@ ScheduleHazardRecognizer::HazardType AIEHazardRecognizer::getHazardType(
 bool AIEHazardRecognizer::checkConflict(
     const ResourceScoreboard<FuncUnitWrapper> &Scoreboard, MachineInstr &MI,
     int DeltaCycles) const {
-  const MCInstrDesc &Desc = MI.getDesc();
+  const MCInstrDesc &Desc = *SelectedAltDescs.getDesc(&MI);
   const unsigned SchedClass =
       TII->getSchedClass(Desc, MI.operands(), MI.getMF()->getRegInfo());
   const MemoryBankBits MemoryBanks = getMemoryBanks(&MI);
diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
index 40825b6fd3cf..4132823d43bf 100644
--- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
+++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
@@ -70,8 +70,9 @@ void emitBundlesTopDown(const std::vector<MachineBundle> &Bundles,
   // then this will not cause conflicts.
   for (int I = TotalBundles - AmountToEmit; I < TotalBundles; I++) {
     for (MachineInstr *MI : Bundles[I].getInstrs())
-      HR->emitInScoreboard(Scoreboard, MI->getDesc(), HR->getMemoryBanks(MI),
-                           MI->operands(), MI->getMF()->getRegInfo(), 0);
+      HR->emitInScoreboard(Scoreboard, *HR->getSelectedAltDescs().getDesc(MI),
+                           HR->getMemoryBanks(MI), MI->operands(),
+                           MI->getMF()->getRegInfo(), 0);
     Scoreboard.advance();
   }
 }
@@ -100,8 +101,9 @@ createBottomUpScoreboard(ArrayRef<MachineBundle> Bundles,
       Bundles.begin(), Bundles.begin() + std::min(NumBundles, RequiredCycles));
   for (const MachineBundle &B : reverse(MinBundles)) {
     for (MachineInstr *MI : B.getInstrs())
-      HR.emitInScoreboard(Scoreboard, MI->getDesc(), HR.getMemoryBanks(MI),
-                          MI->operands(), MI->getMF()->getRegInfo(), 0);
+      HR.emitInScoreboard(Scoreboard, *HR.getSelectedAltDescs().getDesc(MI),
+                          HR.getMemoryBanks(MI), MI->operands(),
+                          MI->getMF()->getRegInfo(), 0);
     Scoreboard.recede();
   }
   return Scoreboard;
@@ -124,9 +126,9 @@ checkResourceConflicts(const ResourceScoreboard<FuncUnitWrapper> &Scoreboard,
     for (MachineInstr *MI : B.getInstrs()) {
       if (BottomUpCycle >= HR.getConflictHorizon())
         break;
-      if (HR.getHazardType(Scoreboard, MI->getDesc(), HR.getMemoryBanks(MI),
-                           MI->operands(), MI->getMF()->getRegInfo(),
-                           -BottomUpCycle))
+      if (HR.getHazardType(Scoreboard, *HR.getSelectedAltDescs().getDesc(MI),
+                           HR.getMemoryBanks(MI), MI->operands(),
+                           MI->getMF()->getRegInfo(), -BottomUpCycle))
         return MI;
     }
     ++BottomUpCycle;
@@ -233,6 +235,7 @@ namespace {
 /// into the appropriate blockstate region.
 /// TimedRegion is built one bundle at the time
 class PipelineExtractor : public PipelineScheduleVisitor {
+  AIEAlternateDescriptors &AlternateDesc;
   BlockState &Loop;
   BlockState *Prologue = nullptr;
   BlockState *Epilogue = nullptr;
@@ -263,14 +266,19 @@ class PipelineExtractor : public PipelineScheduleVisitor {
     // Prologue and epilogue obtain copies.
     MachineInstr *ToBeEmitted =
         InLoop ? MI : Loop.TheBlock->getParent()->CloneMachineInstr(MI);
-    CurrentBundle.add(ToBeEmitted);
+    if (auto AltDesc = AlternateDesc.getSelectedDescriptor(MI);
+        AltDesc.has_value())
+      AlternateDesc.setAlternateDescriptor(ToBeEmitted, AltDesc.value());
+
+    CurrentBundle.add(ToBeEmitted, AlternateDesc.getOpcode(MI));
   }
   void endBundle() override { TimedRegion.emplace_back(CurrentBundle); }
 
 public:
   PipelineExtractor(InterBlockScheduling &InterBlock, BlockState &BS,
                     const AIEBaseInstrInfo &TII)
-      : Loop(BS), CurrentBundle(TII.getFormatInterface()) {
+      : AlternateDesc(InterBlock.getSelectedAltDescs()), Loop(BS),
+        CurrentBundle(TII.getFormatInterface()) {
     MachineBasicBlock *LoopBlock = Loop.TheBlock;
     for (auto *P : LoopBlock->predecessors()) {
       if (P == LoopBlock) {
diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
index 5b27c303933f..e756544c32a3 100644
--- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
+++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp
@@ -87,6 +87,10 @@ static cl::opt<bool> UseLoopHeuristics(
     "aie-loop-sched-heuristics", cl::init(true),
     cl::desc("Use special picking heuristics when scheduling a loop region"));
 
+static cl::opt<bool> ReAssignMultiSlotInstr(
+    "aie-reassign-multislot-instr", cl::init(true),
+    cl::desc("Re-assign multi-slot instructions during iterative scheduling"));
+
 namespace {
 // A sentinel value to represent an unknown SUnit.
 const constexpr unsigned UnknownSUNum = ~0;
@@ -535,7 +539,20 @@ void AIEPostRASchedStrategy::enterMBB(MachineBasicBlock *MBB) {
   IsBottomRegion = true;
 }
 
+void AIEPostRASchedStrategy::materializeMultiOpcodeInstrs() {
+  // Write each selected alternate descriptor into its instruction, then drop
+  // all recorded selections.
+  auto &Selected = InterBlock.getSelectedAltDescs();
+  for (const auto &[MI, Desc] : Selected)
+    MI->setDesc(*Desc);
+  Selected.clear();
+}
+
 void AIEPostRASchedStrategy::commitBlockSchedule(MachineBasicBlock *BB) {
+
+  if (ReAssignMultiSlotInstr)
+    materializeMultiOpcodeInstrs();
+
   auto &BS = InterBlock.getBlockState(BB);
 
   // Safety margin, swp epilogue
@@ -598,8 +615,6 @@ void AIEPostRASchedStrategy::leaveRegion(const SUnit &ExitSU) {
   if (BS.FixPoint.Stage != SchedulingStage::Scheduling) {
     return;
   }
-  materializeMultiOpcodeInstrs();
-  InterBlock.getSelectedAltDescs().clear();
   if (IsBottomRegion) {
     // This is the earliest point where we can destroy the recorded
     // schedule in iterative scheduling. enterMBB and enterRegion are too early,
@@ -615,6 +630,8 @@ void AIEPostRASchedStrategy::leaveRegion(const SUnit &ExitSU) {
   assert(BS.getCurrentRegion().Bundles.empty());
   BS.addBundles(TopBundles);
   BS.addBundles(BotBundles);
+  if (!ReAssignMultiSlotInstr)
+    materializeMultiOpcodeInstrs();
   RegionBegin = nullptr;
   RegionEnd = nullptr;
   IsBottomRegion = false;
@@ -622,27 +639,6 @@ void AIEPostRASchedStrategy::leaveRegion(const SUnit &ExitSU) {
   DEBUG_BLOCKS(dbgs() << "    << leaveRegion\n");
 }
 
-void AIEPostRASchedStrategy::materializeMultiOpcodeInstrs() {
-  const TargetInstrInfo *TII = getTII(CurMBB);
-  const AIEHazardRecognizer &TopHazardRec = *getAIEHazardRecognizer(Top);
-  const AIEHazardRecognizer &BotHazardRec = *getAIEHazardRecognizer(Bot);
-
-  auto MaterializePseudo = [&TII](MachineInstr &MI,
-                                  const AIEHazardRecognizer &HazardRec) {
-    // Materialize instructions with multiple opcode options
-    if (std::optional<unsigned> AltOpcode =
-            HazardRec.getSelectedAltDescs().getSelectedOpcode(&MI)) {
-      MI.setDesc(TII->get(*AltOpcode));
-    }
-  };
-
-  assert(DAG->top() == DAG->bottom());
-  for (MachineInstr &MI : make_range(DAG->begin(), DAG->top()))
-    MaterializePseudo(MI, TopHazardRec);
-  for (MachineInstr &MI : make_range(DAG->bottom(), DAG->end()))
-    MaterializePseudo(MI, BotHazardRec);
-}
-
 bool AIEPostRASchedStrategy::checkInterZoneConflicts(
     const std::vector<AIE::MachineBundle> &BotBundles) const {
   const AIEHazardRecognizer *TopHazardRec = getAIEHazardRecognizer(Top);
diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp
index e20b59c57b5f..0ec91eff07a7 100644
--- a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp
+++ b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp
@@ -266,11 +266,12 @@ bool PostPipeliner::scheduleFirstIteration() {
       return false;
     }
     const int LocalCycle = Actual % II;
+    const MCInstrDesc &Desc = *HR.getSelectedAltDescs().getDesc(MI);
     const MemoryBankBits MemoryBanks = HR.getMemoryBanks(MI);
     LLVM_DEBUG(dbgs() << "  Emit in " << -Depth + LocalCycle << "\n");
     int Cycle = -Depth + LocalCycle;
     LLVM_DEBUG(dbgs() << "  Emit in " << Cycle << "\n");
-    HR.emitInScoreboard(Scoreboard, MI->getDesc(), MemoryBanks, MI->operands(),
+    HR.emitInScoreboard(Scoreboard, Desc, MemoryBanks, MI->operands(),
                         MI->getMF()->getRegInfo(), Cycle);
 
     scheduleNode(SU, Actual);
@@ -317,12 +318,12 @@ bool PostPipeliner::scheduleOtherIterations() {
         LLVM_DEBUG(dbgs() << "  Resource conflict\n");
         return false;
       }
+      const MCInstrDesc &Desc = *HR.getSelectedAltDescs().getDesc(MI);
       const MemoryBankBits MemoryBanks = HR.getMemoryBanks(MI);
       const int LocalCycle = (Insert - CurrentCycle) % II;
       LLVM_DEBUG(dbgs() << "  Emit in " << -Depth + LocalCycle << "\n");
-      HR.emitInScoreboard(Scoreboard, MI->getDesc(), MemoryBanks,
-                          MI->operands(), MI->getMF()->getRegInfo(),
-                          -Depth + LocalCycle);
+      HR.emitInScoreboard(Scoreboard, Desc, MemoryBanks, MI->operands(),
+                          MI->getMF()->getRegInfo(), -Depth + LocalCycle);
       scheduleNode(SU, Insert);
     }
   }
diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-multiSlot.mir b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-multiSlot.mir
new file mode 100644
index 000000000000..d774ea6327a6
--- /dev/null
+++ b/llvm/test/CodeGen/AIE/aie2/schedule/loopaware/loop-multiSlot.mir
@@ -0,0 +1,301 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
+
+# RUN: llc --mtriple=aie2 --run-pass=postmisched  --aie-reassign-multislot-instr=true  %s -o - | FileCheck %s --check-prefix=ON
+# RUN: llc --mtriple=aie2 --run-pass=postmisched  --aie-reassign-multislot-instr=false %s -o - | FileCheck %s --check-prefix=OFF
+
+---
+name:            multislot_across_loop
+alignment:       16
+tracksRegLiveness: true
+body:             |
+  ; ON-LABEL: name: multislot_across_loop
+  ; ON: bb.0:
+  ; ON-NEXT:   successors: %bb.1(0x80000000)
+  ; ON-NEXT:   liveins: $p0, $r0, $r1, $r2
+  ; ON-NEXT: {{  $}}
+  ; ON-NEXT: bb.1:
+  ; ON-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; ON-NEXT:   liveins: $cm0, $cm1, $cm2, $m0, $m1, $p0, $p1, $p2, $r0, $r1, $r2, $r3, $s0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10
+  ; ON-NEXT: {{  $}}
+  ; ON-NEXT:   $wh3 = VSRS_S8_S32_mv_w_srs killed $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+  ; ON-NEXT:   BUNDLE implicit-def $wl3, implicit-def $srsrs_of, implicit-def $cm4, implicit-def $bml4, implicit-def $amll4, implicit-def $amlh4, implicit-def $bmh4, implicit-def $amhl4, implicit-def $amhh4, implicit killed $cm2, implicit $s0, implicit $crsat, implicit $crrnd, implicit $x3, implicit $x5, implicit $r0 {
+  ; ON-NEXT:     $wl3 = VSRS_S8_S32_mv_w_srs killed $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+  ; ON-NEXT:     $cm4 = VMUL_vmac_cm_core_dense $x3, $x5, $r0
+  ; ON-NEXT:   }
+  ; ON-NEXT:   $cm3 = VMUL_vmac_cm_core_dense killed $x7, $x9, $r0
+  ; ON-NEXT:   BUNDLE implicit-def $wh2, implicit-def $srsrs_of, implicit-def $cm1, implicit-def $bml1, implicit-def $amll1, implicit-def $amlh1, implicit-def $bmh1, implicit-def $amhl1, implicit-def $amhh1, implicit killed $cm0, implicit $s0, implicit $crsat, implicit $crrnd, implicit killed $x6, implicit $x8, implicit $r0 {
+  ; ON-NEXT:     $wh2 = VSRS_S8_S32_mv_w_srs killed $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+  ; ON-NEXT:     $cm1 = VMUL_vmac_cm_core_dense killed $x6, $x8, $r0
+  ; ON-NEXT:   }
+  ; ON-NEXT:   VST_dmw_sts_w_ag_idx_imm killed $wh3, $p2, 32 :: (store (<8 x s32>), addrspace 7)
+  ; ON-NEXT:   $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl3, killed $p2, 64 :: (store (<8 x s32>), addrspace 7)
+  ; ON-NEXT:   BUNDLE implicit-def $wl2, implicit-def $srsrs_of, implicit-def $cm2, implicit-def $bml2, implicit-def $amll2, implicit-def $amlh2, implicit-def $bmh2, implicit-def $amhl2, implicit-def $amhh2, implicit killed $cm4, implicit $s0, implicit $crsat, implicit $crrnd, implicit $x2, implicit killed $x4, implicit $r0 {
+  ; ON-NEXT:     $wl2 = VSRS_S8_S32_mv_w_srs killed $cm4, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+  ; ON-NEXT:     $cm2 = VMUL_vmac_cm_core_dense $x2, killed $x4, $r0
+  ; ON-NEXT:   }
+  ; ON-NEXT:   $wh4 = VSRS_S8_S32_mv_w_srs killed $cm3, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+  ; ON-NEXT:   $wl4 = VSRS_S8_S32_mv_w_srs killed $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+  ; ON-NEXT:   VST_dmw_sts_w_ag_idx_imm killed $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7)
+  ; ON-NEXT:   $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl2, killed $p2, 64 :: (store (<8 x s32>), addrspace 7)
+  ; ON-NEXT:   $wl2 = VLDB_dmw_ldb_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6)
+  ; ON-NEXT:   VST_dmw_sts_w_ag_idx_imm $wh4, $p2, 32 :: (store (<8 x s32>), addrspace 7)
+  ; ON-NEXT:   $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl4, killed $p2, 64 :: (store (<8 x s32>), addrspace 7)
+  ; ON-NEXT:   BUNDLE implicit-def $wl6, implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit $p0, implicit $x10, implicit killed $x1, implicit $r0 {
+  ; ON-NEXT:     $wl6 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5)
+  ; ON-NEXT:     $cm0 = VMUL_vmac_cm_core_dense $x10, killed $x1, $r0
+  ; ON-NEXT:   }
+  ; ON-NEXT:   NOP
+  ; ON-NEXT:   NOP
+  ; ON-NEXT:   BUNDLE implicit-def $wl4, implicit-def $p1, implicit-def $wl6, implicit-def $p0, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0 {
+  ; ON-NEXT:     $wl4, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6)
+  ; ON-NEXT:     $wl6, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5)
+  ; ON-NEXT:   }
+  ; ON-NEXT:   $wh2 = VSRS_S8_S32_mv_w_srs killed $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+  ; ON-NEXT:   BUNDLE implicit-def $wl10, implicit-def $wl1, implicit-def $wl2, implicit-def $srsrs_of, implicit-def $wh6, implicit $p1, implicit $p0, implicit killed $cm0, implicit $s0, implicit $crsat, implicit $crrnd, implicit $wl0 {
+  ; ON-NEXT:     $wl10 = VLDA_dmw_lda_w_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6)
+  ; ON-NEXT:     $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5)
+  ; ON-NEXT:     $wl2 = VSRS_S8_S32_mv_w_srs killed $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+  ; ON-NEXT:     $wh6 = VMOV_mv_w $wl0
+  ; ON-NEXT:   }
+  ; ON-NEXT:   BUNDLE implicit-def $wl5, implicit-def $p1, implicit-def $wl3, implicit-def $p0, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0 {
+  ; ON-NEXT:     $wl5, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6)
+  ; ON-NEXT:     $wl3, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5)
+  ; ON-NEXT:   }
+  ; ON-NEXT:   BUNDLE implicit-def $wl9, implicit-def $wl7, implicit-def $r3, implicit-def $srcarry, implicit-def $cm1, implicit-def $bml1, implicit-def $amll1, implicit-def $amlh1, implicit-def $bmh1, implicit-def $amhl1, implicit-def $amhh1, implicit $p1, implicit $p0, implicit killed $r3, implicit $x6, implicit $x2, implicit $r0 {
+  ; ON-NEXT:     $wl9 = VLDA_dmw_lda_w_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6)
+  ; ON-NEXT:     $wl7 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5)
+  ; ON-NEXT:     $r3 = ADD_add_r_ri killed $r3, -4, implicit-def $srcarry
+  ; ON-NEXT:     $cm1 = VMUL_vmac_cm_core_dense $x6, $x2, $r0
+  ; ON-NEXT:   }
+  ; ON-NEXT:   BUNDLE implicit killed $wh2, implicit $p2, implicit $r3 {
+  ; ON-NEXT:     VST_dmw_sts_w_ag_idx_imm killed $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7)
+  ; ON-NEXT:     JNZ $r3, %bb.1
+  ; ON-NEXT:   }
+  ; ON-NEXT:   BUNDLE implicit-def $p2, implicit-def $wh1, implicit killed $wl2, implicit killed $p2, implicit $wl0 {
+  ; ON-NEXT:     $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl2, killed $p2, 64 :: (store (<8 x s32>), addrspace 7)
+  ; ON-NEXT:     $wh1 = VMOV_mv_w $wl0
+  ; ON-NEXT:   }
+  ; ON-NEXT:   BUNDLE implicit-def $wl8, implicit-def $p1, implicit-def $wl6, implicit-def $p0, implicit-def $wh7, implicit-def $cm2, implicit-def $bml2, implicit-def $amll2, implicit-def $amlh2, implicit-def $bmh2, implicit-def $amhl2, implicit-def $amhh2, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0, implicit $wl0, implicit $x6, implicit $x4, implicit $r0 {
+  ; ON-NEXT:     $wl8, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6)
+  ; ON-NEXT:     $wl6, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5)
+  ; ON-NEXT:     $wh7 = VMOV_mv_w $wl0
+  ; ON-NEXT:     $cm2 = VMUL_vmac_cm_core_dense $x6, $x4, $r0
+  ; ON-NEXT:   }
+  ; ON-NEXT:   BUNDLE implicit-def $wl4, implicit-def $wh3, implicit $p1, implicit $wl0 {
+  ; ON-NEXT:     $wl4 = VLDB_dmw_ldb_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6)
+  ; ON-NEXT:     $wh3 = VMOV_mv_w $wl0
+  ; ON-NEXT:   }
+  ; ON-NEXT:   BUNDLE implicit-def $wl2, implicit-def $wh2, implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit $p0, implicit $wl0, implicit $x1, implicit killed $x10, implicit $r0 {
+  ; ON-NEXT:     $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5)
+  ; ON-NEXT:     $wh2 = VMOV_mv_w $wl0
+  ; ON-NEXT:     $cm0 = VMUL_vmac_cm_core_dense $x1, killed $x10, $r0
+  ; ON-NEXT:   }
+  ; ON-NEXT:   BUNDLE implicit-def $wl10, implicit-def $p0, implicit-def $wl1, implicit-def $p1, implicit-def $wh10, implicit killed $p0, implicit $m0, implicit killed $p1, implicit $m1, implicit $wl0 {
+  ; ON-NEXT:     $wl10, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5)
+  ; ON-NEXT:     $wl1, $p1 = VLDB_dmw_ldb_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6)
+  ; ON-NEXT:     $wh10 = VMOV_mv_w $wl0
+  ; ON-NEXT:   }
+  ; ON-NEXT:   DelayedSchedBarrier
+  ; ON-NEXT: {{  $}}
+  ; ON-NEXT: bb.2:
+  ; ON-NEXT:   successors: %bb.3(0x80000000)
+  ; ON-NEXT:   liveins: $r1, $r2
+  ; ON-NEXT: {{  $}}
+  ; ON-NEXT:   NOP
+  ; ON-NEXT:   NOP
+  ; ON-NEXT:   NOP
+  ; ON-NEXT:   NOP
+  ; ON-NEXT:   NOP
+  ; ON-NEXT:   $r2 = OR killed $r2, killed $r1
+  ; ON-NEXT: {{  $}}
+  ; ON-NEXT: bb.3:
+  ; ON-NEXT:   liveins: $r2
+  ; ON-NEXT: {{  $}}
+  ; ON-NEXT:   RET implicit $lr
+  ; ON-NEXT:   NOP
+  ; ON-NEXT:   NOP
+  ; ON-NEXT:   NOP
+  ; ON-NEXT:   NOP
+  ; ON-NEXT:   NOP
+  ; ON-NEXT:   DelayedSchedBarrier implicit killed $r2
+  ;
+  ; OFF-LABEL: name: multislot_across_loop
+  ; OFF: bb.0:
+  ; OFF-NEXT:   successors: %bb.1(0x80000000)
+  ; OFF-NEXT:   liveins: $p0, $r0, $r1, $r2
+  ; OFF-NEXT: {{  $}}
+  ; OFF-NEXT: bb.1:
+  ; OFF-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; OFF-NEXT:   liveins: $cm0, $cm1, $cm2, $m0, $m1, $p0, $p1, $p2, $r0, $r1, $r2, $r3, $s0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10
+  ; OFF-NEXT: {{  $}}
+  ; OFF-NEXT:   $wh3 = VSRS_S8_S32_mv_w_srs killed $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+  ; OFF-NEXT:   BUNDLE implicit-def $wl3, implicit-def $srsrs_of, implicit-def $cm4, implicit-def $bml4, implicit-def $amll4, implicit-def $amlh4, implicit-def $bmh4, implicit-def $amhl4, implicit-def $amhh4, implicit killed $cm2, implicit $s0, implicit $crsat, implicit $crrnd, implicit $x3, implicit $x5, implicit $r0 {
+  ; OFF-NEXT:     $wl3 = VSRS_S8_S32_mv_w_srs killed $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+  ; OFF-NEXT:     $cm4 = VMUL_vmac_cm_core_dense $x3, $x5, $r0
+  ; OFF-NEXT:   }
+  ; OFF-NEXT:   $cm3 = VMUL_vmac_cm_core_dense killed $x7, $x9, $r0
+  ; OFF-NEXT:   BUNDLE implicit-def $wh2, implicit-def $srsrs_of, implicit-def $cm1, implicit-def $bml1, implicit-def $amll1, implicit-def $amlh1, implicit-def $bmh1, implicit-def $amhl1, implicit-def $amhh1, implicit killed $cm0, implicit $s0, implicit $crsat, implicit $crrnd, implicit killed $x6, implicit $x8, implicit $r0 {
+  ; OFF-NEXT:     $wh2 = VSRS_S8_S32_mv_w_srs killed $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+  ; OFF-NEXT:     $cm1 = VMUL_vmac_cm_core_dense killed $x6, $x8, $r0
+  ; OFF-NEXT:   }
+  ; OFF-NEXT:   VST_dmw_sts_w_ag_idx_imm killed $wh3, $p2, 32 :: (store (<8 x s32>), addrspace 7)
+  ; OFF-NEXT:   $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl3, killed $p2, 64 :: (store (<8 x s32>), addrspace 7)
+  ; OFF-NEXT:   BUNDLE implicit-def $wl2, implicit-def $srsrs_of, implicit-def $cm2, implicit-def $bml2, implicit-def $amll2, implicit-def $amlh2, implicit-def $bmh2, implicit-def $amhl2, implicit-def $amhh2, implicit killed $cm4, implicit $s0, implicit $crsat, implicit $crrnd, implicit $x2, implicit killed $x4, implicit $r0 {
+  ; OFF-NEXT:     $wl2 = VSRS_S8_S32_mv_w_srs killed $cm4, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+  ; OFF-NEXT:     $cm2 = VMUL_vmac_cm_core_dense $x2, killed $x4, $r0
+  ; OFF-NEXT:   }
+  ; OFF-NEXT:   $wh4 = VSRS_S8_S32_mv_w_srs killed $cm3, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+  ; OFF-NEXT:   $wl4 = VSRS_S8_S32_mv_w_srs killed $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+  ; OFF-NEXT:   VST_dmw_sts_w_ag_idx_imm killed $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7)
+  ; OFF-NEXT:   $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl2, killed $p2, 64 :: (store (<8 x s32>), addrspace 7)
+  ; OFF-NEXT:   $wl2 = VLDB_dmw_ldb_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6)
+  ; OFF-NEXT:   VST_dmw_sts_w_ag_idx_imm $wh4, $p2, 32 :: (store (<8 x s32>), addrspace 7)
+  ; OFF-NEXT:   $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl4, killed $p2, 64 :: (store (<8 x s32>), addrspace 7)
+  ; OFF-NEXT:   BUNDLE implicit-def $wl6, implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit $p0, implicit $x10, implicit killed $x1, implicit $r0 {
+  ; OFF-NEXT:     $wl6 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5)
+  ; OFF-NEXT:     $cm0 = VMUL_vmac_cm_core_dense $x10, killed $x1, $r0
+  ; OFF-NEXT:   }
+  ; OFF-NEXT:   NOP
+  ; OFF-NEXT:   NOP
+  ; OFF-NEXT:   NOP
+  ; OFF-NEXT:   BUNDLE implicit-def $wl4, implicit-def $p1, implicit-def $wl6, implicit-def $p0, implicit-def $wh2, implicit-def $srsrs_of, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0, implicit killed $cm2, implicit $s0, implicit $crsat, implicit $crrnd {
+  ; OFF-NEXT:     $wl4, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6)
+  ; OFF-NEXT:     $wl6, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5)
+  ; OFF-NEXT:     $wh2 = VSRS_S8_S32_mv_w_srs killed $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+  ; OFF-NEXT:   }
+  ; OFF-NEXT:   BUNDLE implicit-def $wl10, implicit-def $wl1, implicit-def $wl2, implicit-def $srsrs_of, implicit-def $wh6, implicit $p1, implicit $p0, implicit killed $cm0, implicit $s0, implicit $crsat, implicit $crrnd, implicit $wl0 {
+  ; OFF-NEXT:     $wl10 = VLDA_dmw_lda_w_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6)
+  ; OFF-NEXT:     $wl1 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5)
+  ; OFF-NEXT:     $wl2 = VSRS_S8_S32_mv_w_srs killed $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+  ; OFF-NEXT:     $wh6 = VMOV_mv_w $wl0
+  ; OFF-NEXT:   }
+  ; OFF-NEXT:   BUNDLE implicit-def $wl5, implicit-def $p1, implicit-def $wl3, implicit-def $p0, implicit killed $p1, implicit $m1, implicit killed $p0, implicit $m0 {
+  ; OFF-NEXT:     $wl5, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6)
+  ; OFF-NEXT:     $wl3, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5)
+  ; OFF-NEXT:   }
+  ; OFF-NEXT:   BUNDLE implicit-def $wl9, implicit-def $wl7, implicit-def $cm1, implicit-def $bml1, implicit-def $amll1, implicit-def $amlh1, implicit-def $bmh1, implicit-def $amhl1, implicit-def $amhh1, implicit $p1, implicit $p0, implicit $x6, implicit $x2, implicit $r0 {
+  ; OFF-NEXT:     $wl9 = VLDA_dmw_lda_w_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6)
+  ; OFF-NEXT:     $wl7 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5)
+  ; OFF-NEXT:     $cm1 = VMUL_vmac_cm_core_dense $x6, $x2, $r0
+  ; OFF-NEXT:   }
+  ; OFF-NEXT:   BUNDLE implicit-def $r3, implicit-def $srcarry, implicit killed $wh2, implicit $p2, implicit killed $r3 {
+  ; OFF-NEXT:     VST_dmw_sts_w_ag_idx_imm killed $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7)
+  ; OFF-NEXT:     $r3 = ADD_add_r_ri killed $r3, -4, implicit-def $srcarry
+  ; OFF-NEXT:   }
+  ; OFF-NEXT:   BUNDLE implicit-def $p2, implicit killed $wl2, implicit killed $p2, implicit $r3 {
+  ; OFF-NEXT:     $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm killed $wl2, killed $p2, 64 :: (store (<8 x s32>), addrspace 7)
+  ; OFF-NEXT:     JNZ $r3, %bb.1
+  ; OFF-NEXT:   }
+  ; OFF-NEXT:   BUNDLE implicit-def $wl8, implicit-def $p1, implicit-def $wh1, implicit killed $p1, implicit $m1, implicit $wl0 {
+  ; OFF-NEXT:     $wl8, $p1 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6)
+  ; OFF-NEXT:     $wh1 = VMOV_mv_w $wl0
+  ; OFF-NEXT:   }
+  ; OFF-NEXT:   BUNDLE implicit-def $wl4, implicit-def $wl6, implicit-def $p0, implicit-def $wh7, implicit-def $cm2, implicit-def $bml2, implicit-def $amll2, implicit-def $amlh2, implicit-def $bmh2, implicit-def $amhl2, implicit-def $amhh2, implicit $p1, implicit killed $p0, implicit $m0, implicit $wl0, implicit $x6, implicit $x4, implicit $r0 {
+  ; OFF-NEXT:     $wl4 = VLDA_dmw_lda_w_ag_idx_imm $p1, 32 :: (load (<8 x s32>), addrspace 6)
+  ; OFF-NEXT:     $wl6, $p0 = VLDB_dmw_ldb_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5)
+  ; OFF-NEXT:     $wh7 = VMOV_mv_w $wl0
+  ; OFF-NEXT:     $cm2 = VMUL_vmac_cm_core_dense $x6, $x4, $r0
+  ; OFF-NEXT:   }
+  ; OFF-NEXT:   $wh3 = VMOV_mv_w $wl0
+  ; OFF-NEXT:   BUNDLE implicit-def $wl2, implicit-def $wh2, implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit $p0, implicit $wl0, implicit $x1, implicit killed $x10, implicit $r0 {
+  ; OFF-NEXT:     $wl2 = VLDB_dmw_ldb_ag_idx_imm $p0, 32 :: (load (<8 x s32>), addrspace 5)
+  ; OFF-NEXT:     $wh2 = VMOV_mv_w $wl0
+  ; OFF-NEXT:     $cm0 = VMUL_vmac_cm_core_dense $x1, killed $x10, $r0
+  ; OFF-NEXT:   }
+  ; OFF-NEXT:   BUNDLE implicit-def $wl10, implicit-def $p0, implicit-def $wl1, implicit-def $p1, implicit-def $wh10, implicit killed $p0, implicit $m0, implicit killed $p1, implicit $m1, implicit $wl0 {
+  ; OFF-NEXT:     $wl10, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm killed $p0, $m0 :: (load (<8 x s32>), addrspace 5)
+  ; OFF-NEXT:     $wl1, $p1 = VLDB_dmw_ldb_ag_pstm_nrm killed $p1, $m1 :: (load (<8 x s32>), addrspace 6)
+  ; OFF-NEXT:     $wh10 = VMOV_mv_w $wl0
+  ; OFF-NEXT:   }
+  ; OFF-NEXT:   DelayedSchedBarrier
+  ; OFF-NEXT: {{  $}}
+  ; OFF-NEXT: bb.2:
+  ; OFF-NEXT:   successors: %bb.3(0x80000000)
+  ; OFF-NEXT:   liveins: $r1, $r2
+  ; OFF-NEXT: {{  $}}
+  ; OFF-NEXT:   NOP
+  ; OFF-NEXT:   NOP
+  ; OFF-NEXT:   NOP
+  ; OFF-NEXT:   NOP
+  ; OFF-NEXT:   NOP
+  ; OFF-NEXT:   $r2 = OR killed $r2, killed $r1
+  ; OFF-NEXT: {{  $}}
+  ; OFF-NEXT: bb.3:
+  ; OFF-NEXT:   liveins: $r2
+  ; OFF-NEXT: {{  $}}
+  ; OFF-NEXT:   RET implicit $lr
+  ; OFF-NEXT:   NOP
+  ; OFF-NEXT:   NOP
+  ; OFF-NEXT:   NOP
+  ; OFF-NEXT:   NOP
+  ; OFF-NEXT:   NOP
+  ; OFF-NEXT:   DelayedSchedBarrier implicit killed $r2
+  bb.0:
+    liveins: $p0, $r0, $r1, $r2
+    successors: %bb.1
+  bb.1:
+    successors: %bb.1, %bb.2
+    liveins: $cm0, $cm1, $cm2, $m0, $m1, $p0, $p1, $p2, $r0, $r1, $r2, $r3, $s0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10
+    $cm4 = VMUL_vmac_cm_core_dense $x3, $x5, $r0
+    $cm3 = VMUL_vmac_cm_core_dense $x7, $x9, $r0
+    $wh3 = VSRS_S8_S32_mv_w_srs $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+    $wl3 = VSRS_S8_S32_mv_w_srs $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+    VST_dmw_sts_w_ag_idx_imm $wh3, $p2, 32 :: (store (<8 x s32>), addrspace 7)
+    $cm1 = VMUL_vmac_cm_core_dense $x6, $x8, $r0
+    $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm $wl3, $p2, 64 :: (store (<8 x s32>), addrspace 7)
+    $cm2 = VMUL_vmac_cm_core_dense $x2, $x4, $r0
+    $wh2 = VSRS_S8_S32_mv_w_srs $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+    $wl2 = VSRS_S8_S32_mv_w_srs $cm4, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+    VST_dmw_sts_w_ag_idx_imm $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7)
+    $cm0 = VMUL_vmac_cm_core_dense $x10, $x1, $r0
+    $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm $wl2, $p2, 64 :: (store (<8 x s32>), addrspace 7)
+    $wl2 = VLD_idx_imm_3x32_pseudo $p1, 32 :: (load (<8 x s32>), align 32, addrspace 6)
+    $wh4 = VSRS_S8_S32_mv_w_srs $cm3, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+    $wl4 = VSRS_S8_S32_mv_w_srs $cm1, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+    VST_dmw_sts_w_ag_idx_imm $wh4, $p2, 32 :: (store (<8 x s32>), addrspace 7)
+    $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm $wl4, $p2, 64 :: (store (<8 x s32>), addrspace 7)
+    $wl4, $p1 = VLD_pstm_pseudo $p1, $m1 :: (load (<8 x s32>), align 32, addrspace 6)
+    $wl6 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<8 x s32>), align 32, addrspace 5)
+    $wh6 = VMOV_mv_w $wl0
+    $cm1 = VMUL_vmac_cm_core_dense $x6, $x2, $r0
+    $wh2 = VSRS_S8_S32_mv_w_srs $cm2, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+    $wl2 = VSRS_S8_S32_mv_w_srs $cm0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd
+    $wl10 = VLD_idx_imm_3x32_pseudo $p1, 32 :: (load (<8 x s32>), align 32, addrspace 6)
+    $wl6, $p0 = VLD_pstm_pseudo $p0, $m0 :: (load (<8 x s32>), align 32, addrspace 5)
+    $wl1 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<8 x s32>), align 32, addrspace 5)
+    $wl5, $p1 = VLD_pstm_pseudo $p1, $m1 :: (load (<8 x s32>), align 32, addrspace 6)
+    $wl3, $p0 = VLD_pstm_pseudo $p0, $m0 :: (load (<8 x s32>), align 32, addrspace 5)
+    $wl9 = VLD_idx_imm_3x32_pseudo $p1, 32 :: (load (<8 x s32>), align 32, addrspace 6)
+    $wl7 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<8 x s32>), align 32, addrspace 5)
+    VST_dmw_sts_w_ag_idx_imm $wh2, $p2, 32 :: (store (<8 x s32>), addrspace 7)
+    $p2 = VST_dmw_sts_w_ag_pstm_nrm_imm $wl2, $p2, 64 :: (store (<8 x s32>), addrspace 7)
+    $cm2 = VMUL_vmac_cm_core_dense $x6, $x4, $r0
+    $wl8, $p1 = VLD_pstm_pseudo $p1, $m1 :: (load (<8 x s32>), align 32, addrspace 6)
+    $wl6, $p0 = VLD_pstm_pseudo $p0, $m0 :: (load (<8 x s32>), align 32, addrspace 5)
+    $wl2 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<8 x s32>), align 32, addrspace 5)
+    $wl4 = VLD_idx_imm_3x32_pseudo $p1, 32 :: (load (<8 x s32>), align 32, addrspace 6)
+    $wh1 = VMOV_mv_w $wl0
+    $cm0 = VMUL_vmac_cm_core_dense $x1, $x10, $r0
+    $wl10, $p0 = VLD_pstm_pseudo $p0, $m0 :: (load (<8 x s32>), align 32, addrspace 5)
+    $wl1, $p1 = VLD_pstm_pseudo $p1, $m1 :: (load (<8 x s32>), align 32, addrspace 6)
+    $r3 = ADD_add_r_ri $r3, -4, implicit-def $srcarry
+    $wh3 = VMOV_mv_w $wl0
+    $wh7 = VMOV_mv_w $wl0
+    $wh2 = VMOV_mv_w $wl0
+    $wh10 = VMOV_mv_w $wl0
+    JNZ $r3, %bb.1
+    DelayedSchedBarrier
+  bb.2:
+    liveins: $r1, $r2
+    successors: %bb.3
+    $r2 = OR $r2, $r1
+  bb.3:
+    liveins: $r2
+    RET implicit $lr
+    DelayedSchedBarrier implicit $r2
+...